rf_paritylog.c revision 1.1
1/*	$NetBSD: rf_paritylog.c,v 1.1 1998/11/13 04:20:31 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/* Code for manipulating in-core parity logs
30 *
31 * :
32 * Log: rf_paritylog.c,v
33 * Revision 1.27  1996/07/28 20:31:39  jimz
34 * i386netbsd port
35 * true/false fixup
36 *
37 * Revision 1.26  1996/07/27  23:36:08  jimz
38 * Solaris port of simulator
39 *
40 * Revision 1.25  1996/07/17  21:00:58  jimz
41 * clean up timer interface, tracing
42 *
43 * Revision 1.24  1996/06/11  10:18:59  jimz
44 * AllocParityLogCommonData() was freeing the common pointer immediately
45 * after allocating this. It appeared that this free really belonged
46 * inside one of the failure cases (for backing out), so I moved it
47 * in there.
48 *
49 * Revision 1.23  1996/06/05  18:06:02  jimz
50 * Major code cleanup. The Great Renaming is now done.
51 * Better modularity. Better typing. Fixed a bunch of
52 * synchronization bugs. Made a lot of global stuff
53 * per-desc or per-array. Removed dead code.
54 *
55 * Revision 1.22  1996/06/02  17:31:48  jimz
56 * Moved a lot of global stuff into array structure, where it belongs.
57 * Fixed up paritylogging, pss modules in this manner. Some general
58 * code cleanup. Removed lots of dead code, some dead files.
59 *
60 * Revision 1.21  1996/05/31  22:26:54  jimz
61 * fix a lot of mapping problems, memory allocation problems
62 * found some weird lock issues, fixed 'em
63 * more code cleanup
64 *
65 * Revision 1.20  1996/05/30  23:22:16  jimz
66 * bugfixes of serialization, timing problems
67 * more cleanup
68 *
69 * Revision 1.19  1996/05/30  12:59:18  jimz
70 * make etimer happier, more portable
71 *
72 * Revision 1.18  1996/05/27  18:56:37  jimz
73 * more code cleanup
74 * better typing
75 * compiles in all 3 environments
76 *
77 * Revision 1.17  1996/05/24  04:28:55  jimz
78 * release cleanup ckpt
79 *
80 * Revision 1.16  1996/05/23  21:46:35  jimz
81 * checkpoint in code cleanup (release prep)
82 * lots of types, function names have been fixed
83 *
84 * Revision 1.15  1996/05/23  00:33:23  jimz
85 * code cleanup: move all debug decls to rf_options.c, all extern
86 * debug decls to rf_options.h, all debug vars preceded by rf_
87 *
88 * Revision 1.14  1996/05/20  16:16:59  jimz
89 * switch to rf_{mutex,cond}_{init,destroy}
90 *
91 * Revision 1.13  1996/05/18  19:51:34  jimz
92 * major code cleanup- fix syntax, make some types consistent,
93 * add prototypes, clean out dead code, et cetera
94 *
95 * Revision 1.12  1995/12/12  18:10:06  jimz
96 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
97 * fix 80-column brain damage in comments
98 *
99 * Revision 1.11  1995/12/06  20:54:44  wvcii
100 * added prototyping
101 *
102 * Revision 1.10  1995/11/30  16:05:37  wvcii
103 * added copyright info
104 *
105 * Revision 1.9  1995/10/08  20:41:28  wvcii
106 * fixed bug in allocation of CommonLogData (was allocating incorrect size)
107 *
108 * Revision 1.8  1995/09/07  15:52:12  jimz
109 * noop compile when INCLUDE_PARITYLOGGING not defined
110 *
111 * Revision 1.7  1995/09/06  19:17:36  wvcii
112 * moved code for reintegration to rf_paritylogDiskMgr.c
113 *
114 * Revision 1.6  95/07/07  00:16:06  wvcii
115 * this version free from deadlock, fails parity verification
116 *
117 * Revision 1.5  1995/06/09  13:14:24  wvcii
118 * code is now nonblocking
119 *
120 * Revision 1.4  95/06/01  17:01:59  wvcii
121 * code debug
122 *
123 * Revision 1.3  95/05/31  13:08:23  wvcii
124 * code debug
125 *
126 * Revision 1.2  95/05/21  15:42:15  wvcii
127 * code debug
128 *
129 * Revision 1.1  95/05/18  10:43:54  wvcii
130 * Initial revision
131 *
132 */
133
134#include "rf_archs.h"
135
136#if RF_INCLUDE_PARITYLOGGING > 0
137
138/*
139 * Append-only log for recording parity "update" and "overwrite" records
140 */
141
142#include "rf_types.h"
143#include "rf_threadstuff.h"
144#include "rf_mcpair.h"
145#include "rf_raid.h"
146#include "rf_dag.h"
147#include "rf_dagfuncs.h"
148#include "rf_desc.h"
149#include "rf_layout.h"
150#include "rf_diskqueue.h"
151#include "rf_etimer.h"
152#include "rf_paritylog.h"
153#include "rf_general.h"
154#include "rf_threadid.h"
155#include "rf_map.h"
156#include "rf_paritylogging.h"
157#include "rf_paritylogDiskMgr.h"
158#include "rf_sys.h"
159
160static RF_CommonLogData_t *AllocParityLogCommonData(RF_Raid_t *raidPtr)
161{
162  RF_CommonLogData_t *common = NULL;
163  int rc;
164
165  /* Return a struct for holding common parity log information from the free
166     list (rf_parityLogDiskQueue.freeCommonList).  If the free list is empty, call
167     RF_Malloc to create a new structure.
168     NON-BLOCKING */
169
170  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
171  if (raidPtr->parityLogDiskQueue.freeCommonList)
172    {
173      common = raidPtr->parityLogDiskQueue.freeCommonList;
174      raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
175      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
176    }
177  else
178    {
179      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
180      RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
181      rc = rf_mutex_init(&common->mutex);
182      if (rc) {
183        RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
184          __LINE__, rc);
185        RF_Free(common, sizeof(RF_CommonLogData_t));
186        common = NULL;
187      }
188    }
189  common->next = NULL;
190  return(common);
191}
192
193static void FreeParityLogCommonData(RF_CommonLogData_t *common)
194{
195  RF_Raid_t *raidPtr;
196
197  /* Insert a single struct for holding parity log information
198     (data) into the free list (rf_parityLogDiskQueue.freeCommonList).
199     NON-BLOCKING */
200
201  raidPtr = common->raidPtr;
202  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
203  common->next = raidPtr->parityLogDiskQueue.freeCommonList;
204  raidPtr->parityLogDiskQueue.freeCommonList = common;
205  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
206}
207
208static RF_ParityLogData_t *AllocParityLogData(RF_Raid_t *raidPtr)
209{
210  RF_ParityLogData_t *data = NULL;
211
212  /* Return a struct for holding parity log information from the free
213     list (rf_parityLogDiskQueue.freeList).  If the free list is empty, call
214     RF_Malloc to create a new structure.
215     NON-BLOCKING */
216
217  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
218  if (raidPtr->parityLogDiskQueue.freeDataList)
219    {
220      data = raidPtr->parityLogDiskQueue.freeDataList;
221      raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
222      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
223    }
224  else
225    {
226      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
227      RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
228    }
229  data->next = NULL;
230  data->prev = NULL;
231  return(data);
232}
233
234
235static void FreeParityLogData(RF_ParityLogData_t *data)
236{
237  RF_ParityLogData_t *nextItem;
238  RF_Raid_t *raidPtr;
239
240  /* Insert a linked list of structs for holding parity log
241     information (data) into the free list (parityLogDiskQueue.freeList).
242     NON-BLOCKING */
243
244  raidPtr = data->common->raidPtr;
245  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
246  while (data)
247    {
248      nextItem = data->next;
249      data->next = raidPtr->parityLogDiskQueue.freeDataList;
250      raidPtr->parityLogDiskQueue.freeDataList = data;
251      data = nextItem;
252    }
253  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
254}
255
256
257static void EnqueueParityLogData(
258  RF_ParityLogData_t   *data,
259  RF_ParityLogData_t  **head,
260  RF_ParityLogData_t  **tail)
261{
262  RF_Raid_t *raidPtr;
263
264  /* Insert an in-core parity log (*data) into the head of
265     a disk queue (*head, *tail).
266     NON-BLOCKING */
267
268  raidPtr = data->common->raidPtr;
269  if (rf_parityLogDebug)
270    printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector);
271  RF_ASSERT(data->prev == NULL);
272  RF_ASSERT(data->next == NULL);
273  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
274  if (*head)
275    {
276      /* insert into head of queue */
277      RF_ASSERT((*head)->prev == NULL);
278      RF_ASSERT((*tail)->next == NULL);
279      data->next = *head;
280      (*head)->prev = data;
281      *head = data;
282    }
283  else
284    {
285      /* insert into empty list */
286      RF_ASSERT(*head == NULL);
287      RF_ASSERT(*tail == NULL);
288      *head = data;
289      *tail = data;
290    }
291  RF_ASSERT((*head)->prev == NULL);
292  RF_ASSERT((*tail)->next == NULL);
293  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
294}
295
296static RF_ParityLogData_t *DequeueParityLogData(
297  RF_Raid_t            *raidPtr,
298  RF_ParityLogData_t  **head,
299  RF_ParityLogData_t  **tail,
300  int                   ignoreLocks)
301{
302  RF_ParityLogData_t *data;
303
304  /* Remove and return an in-core parity log from the tail of
305     a disk queue (*head, *tail).
306     NON-BLOCKING */
307
308  /* remove from tail, preserving FIFO order */
309  if (!ignoreLocks)
310    RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
311  data = *tail;
312  if (data)
313    {
314      if (*head == *tail)
315	{
316	  /* removing last item from queue */
317	  *head = NULL;
318	  *tail = NULL;
319	}
320      else
321	{
322	  *tail = (*tail)->prev;
323	  (*tail)->next = NULL;
324	  RF_ASSERT((*head)->prev == NULL);
325	  RF_ASSERT((*tail)->next == NULL);
326	}
327      data->next = NULL;
328      data->prev = NULL;
329      if (rf_parityLogDebug)
330	printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector);
331    }
332  if (*head)
333    {
334      RF_ASSERT((*head)->prev == NULL);
335      RF_ASSERT((*tail)->next == NULL);
336    }
337  if (!ignoreLocks)
338    RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
339  return(data);
340}
341
342
343static void RequeueParityLogData(
344  RF_ParityLogData_t   *data,
345  RF_ParityLogData_t  **head,
346  RF_ParityLogData_t  **tail)
347{
348  RF_Raid_t *raidPtr;
349
350  /* Insert an in-core parity log (*data) into the tail of
351     a disk queue (*head, *tail).
352     NON-BLOCKING */
353
354  raidPtr = data->common->raidPtr;
355  RF_ASSERT(data);
356  if (rf_parityLogDebug)
357    printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
358  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
359  if (*tail)
360    {
361      /* append to tail of list */
362      data->prev = *tail;
363      data->next = NULL;
364      (*tail)->next = data;
365      *tail = data;
366    }
367  else
368    {
369      /* inserting into an empty list */
370      *head = data;
371      *tail = data;
372      (*head)->prev = NULL;
373      (*tail)->next = NULL;
374    }
375  RF_ASSERT((*head)->prev == NULL);
376  RF_ASSERT((*tail)->next == NULL);
377  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
378}
379
380RF_ParityLogData_t *rf_CreateParityLogData(
381  RF_ParityRecordType_t    operation,
382  RF_PhysDiskAddr_t       *pda,
383  caddr_t                  bufPtr,
384  RF_Raid_t               *raidPtr,
385  int                    (*wakeFunc)(RF_DagNode_t *node, int status),
386  void                    *wakeArg,
387  RF_AccTraceEntry_t      *tracerec,
388  RF_Etimer_t              startTime)
389{
390  RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
391  RF_CommonLogData_t *common;
392  RF_PhysDiskAddr_t *diskAddress;
393  int boundary, offset = 0;
394
395  /* Return an initialized struct of info to be logged.
396     Build one item per physical disk address, one item per region.
397
398     NON-BLOCKING */
399
400  diskAddress = pda;
401  common = AllocParityLogCommonData(raidPtr);
402  RF_ASSERT(common);
403
404  common->operation = operation;
405  common->bufPtr = bufPtr;
406  common->raidPtr = raidPtr;
407  common->wakeFunc = wakeFunc;
408  common->wakeArg = wakeArg;
409  common->tracerec = tracerec;
410  common->startTime = startTime;
411  common->cnt = 0;
412
413  if (rf_parityLogDebug)
414    printf("[entering CreateParityLogData]\n");
415  while (diskAddress)
416    {
417      common->cnt++;
418      data = AllocParityLogData(raidPtr);
419      RF_ASSERT(data);
420      data->common = common;
421      data->next = NULL;
422      data->prev = NULL;
423      data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
424      if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1))
425	{
426	  /* disk address does not cross a region boundary */
427	  data->diskAddress = *diskAddress;
428	  data->bufOffset = offset;
429	  offset = offset + diskAddress->numSector;
430	  EnqueueParityLogData(data, &resultHead, &resultTail);
431	  /* adjust disk address */
432	  diskAddress = diskAddress->next;
433	}
434      else
435	{
436	  /* disk address crosses a region boundary */
437	  /* find address where region is crossed */
438	  boundary = 0;
439	  while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
440	    boundary++;
441
442	  /* enter data before the boundary */
443	  data->diskAddress = *diskAddress;
444	  data->diskAddress.numSector = boundary;
445	  data->bufOffset = offset;
446	  offset += boundary;
447	  EnqueueParityLogData(data, &resultHead, &resultTail);
448	  /* adjust disk address */
449	  diskAddress->startSector += boundary;
450	  diskAddress->numSector -= boundary;
451	}
452    }
453  if (rf_parityLogDebug)
454    printf("[leaving CreateParityLogData]\n");
455  return(resultHead);
456}
457
458
459RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(
460  RF_Raid_t            *raidPtr,
461  int                   regionID,
462  RF_ParityLogData_t  **head,
463  RF_ParityLogData_t  **tail,
464  int                   ignoreLocks)
465{
466  RF_ParityLogData_t *w;
467
468  /* Remove and return an in-core parity log from a specified region (regionID).
469     If a matching log is not found, return NULL.
470
471     NON-BLOCKING.
472     */
473
474  /* walk backward through a list, looking for an entry with a matching region ID */
475  if (!ignoreLocks)
476    RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
477  w = (*tail);
478  while (w)
479    {
480      if (w->regionID == regionID)
481	{
482	  /* remove an element from the list */
483	  if (w == *tail)
484	    {
485	      if (*head == *tail)
486		{
487		  /* removing only element in the list */
488		  *head = NULL;
489		  *tail = NULL;
490		}
491	      else
492		{
493		  /* removing last item in the list */
494		  *tail = (*tail)->prev;
495		  (*tail)->next = NULL;
496		  RF_ASSERT((*head)->prev == NULL);
497		  RF_ASSERT((*tail)->next == NULL);
498		}
499	    }
500	  else
501	    {
502	      if (w == *head)
503		{
504		  /* removing first item in the list */
505		  *head = (*head)->next;
506		  (*head)->prev = NULL;
507		  RF_ASSERT((*head)->prev == NULL);
508		  RF_ASSERT((*tail)->next == NULL);
509		}
510	      else
511		{
512		  /* removing an item from the middle of the list */
513		  w->prev->next = w->next;
514		  w->next->prev = w->prev;
515		  RF_ASSERT((*head)->prev == NULL);
516		  RF_ASSERT((*tail)->next == NULL);
517		}
518	    }
519	  w->prev = NULL;
520	  w->next = NULL;
521	  if (rf_parityLogDebug)
522	    printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",w->regionID,(int)w->diskAddress.raidAddress,(int) w->diskAddress.numSector);
523	  return(w);
524	}
525      else
526	w = w->prev;
527    }
528  if (!ignoreLocks)
529    RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
530  return(NULL);
531}
532
533static RF_ParityLogData_t *DequeueMatchingLogData(
534  RF_Raid_t            *raidPtr,
535  RF_ParityLogData_t  **head,
536  RF_ParityLogData_t  **tail)
537{
538  RF_ParityLogData_t *logDataList, *logData;
539  int regionID;
540
541  /* Remove and return an in-core parity log from the tail of
542     a disk queue (*head, *tail).  Then remove all matching
543     (identical regionIDs) logData and return as a linked list.
544
545     NON-BLOCKING
546     */
547
548  logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
549  if (logDataList)
550    {
551      regionID = logDataList->regionID;
552      logData = logDataList;
553      logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
554      while (logData->next)
555	{
556	  logData = logData->next;
557	  logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
558	}
559    }
560  return(logDataList);
561}
562
563
564static RF_ParityLog_t *AcquireParityLog(
565  RF_ParityLogData_t  *logData,
566  int                  finish)
567{
568  RF_ParityLog_t *log = NULL;
569  RF_Raid_t *raidPtr;
570
571  /* Grab a log buffer from the pool and return it.
572     If no buffers are available, return NULL.
573     NON-BLOCKING
574     */
575  raidPtr = logData->common->raidPtr;
576  RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
577  if (raidPtr->parityLogPool.parityLogs)
578    {
579      log = raidPtr->parityLogPool.parityLogs;
580      raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
581      log->regionID = logData->regionID;
582      log->numRecords = 0;
583      log->next = NULL;
584      raidPtr->logsInUse++;
585      RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
586    }
587  else
588    {
589      /* no logs available, so place ourselves on the queue of work waiting on log buffers
590	 this is done while parityLogPool.mutex is held, to ensure synchronization
591	 with ReleaseParityLogs.
592	 */
593      if (rf_parityLogDebug)
594	printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
595      if (finish)
596	RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
597      else
598	EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
599    }
600  RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
601  return(log);
602}
603
604void rf_ReleaseParityLogs(
605  RF_Raid_t       *raidPtr,
606  RF_ParityLog_t  *firstLog)
607{
608  RF_ParityLogData_t *logDataList;
609  RF_ParityLog_t *log, *lastLog;
610  int cnt;
611
612  /* Insert a linked list of parity logs (firstLog) to
613     the free list (parityLogPool.parityLogPool)
614
615     NON-BLOCKING.
616     */
617
618  RF_ASSERT(firstLog);
619
620  /* Before returning logs to global free list, service all
621     requests which are blocked on logs.  Holding mutexes for parityLogPool and parityLogDiskQueue
622     forces synchronization with AcquireParityLog().
623     */
624  RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
625  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
626  logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
627  log = firstLog;
628  if (firstLog)
629    firstLog = firstLog->next;
630  log->numRecords = 0;
631  log->next = NULL;
632  while (logDataList && log)
633    {
634      RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
635      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
636      rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
637      if (rf_parityLogDebug)
638	printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
639      if (log == NULL)
640	{
641	  log = firstLog;
642	  if (firstLog)
643	    {
644	      firstLog = firstLog->next;
645	      log->numRecords = 0;
646	      log->next = NULL;
647	    }
648	}
649      RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
650      RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
651      if (log)
652	logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
653    }
654  /* return remaining logs to pool */
655  if (log)
656    {
657      log->next = firstLog;
658      firstLog = log;
659    }
660  if (firstLog)
661    {
662      lastLog = firstLog;
663      raidPtr->logsInUse--;
664      RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
665      while (lastLog->next)
666	{
667	  lastLog = lastLog->next;
668	  raidPtr->logsInUse--;
669	  RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
670	}
671      lastLog->next = raidPtr->parityLogPool.parityLogs;
672      raidPtr->parityLogPool.parityLogs = firstLog;
673      cnt = 0;
674      log = raidPtr->parityLogPool.parityLogs;
675      while (log)
676	{
677	  cnt++;
678	  log = log->next;
679	}
680      RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
681    }
682  RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
683  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
684}
685
686static void ReintLog(
687  RF_Raid_t       *raidPtr,
688  int              regionID,
689  RF_ParityLog_t  *log)
690{
691  RF_ASSERT(log);
692
693  /* Insert an in-core parity log (log) into the disk queue of reintegration
694     work.  Set the flag (reintInProgress) for the specified region (regionID)
695     to indicate that reintegration is in progress for this region.
696     NON-BLOCKING
697     */
698
699  RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
700  raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;  /* cleared when reint complete */
701
702  if (rf_parityLogDebug)
703    printf("[requesting reintegration of region %d]\n", log->regionID);
704  /* move record to reintegration queue */
705  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
706  log->next = raidPtr->parityLogDiskQueue.reintQueue;
707  raidPtr->parityLogDiskQueue.reintQueue = log;
708  RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
709  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
710  RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
711}
712
713static void FlushLog(
714  RF_Raid_t       *raidPtr,
715  RF_ParityLog_t  *log)
716{
717  /* insert a core log (log) into a list of logs (parityLogDiskQueue.flushQueue)
718     waiting to be written to disk.
719     NON-BLOCKING
720     */
721
722  RF_ASSERT(log);
723  RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
724  RF_ASSERT(log->next == NULL);
725  /* move log to flush queue */
726  RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
727  log->next = raidPtr->parityLogDiskQueue.flushQueue;
728  raidPtr->parityLogDiskQueue.flushQueue = log;
729  RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
730  RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
731}
732
733static int DumpParityLogToDisk(
734  int                  finish,
735  RF_ParityLogData_t  *logData)
736{
737  int i, diskCount, regionID = logData->regionID;
738  RF_ParityLog_t *log;
739  RF_Raid_t *raidPtr;
740
741  raidPtr = logData->common->raidPtr;
742
743  /* Move a core log to disk.  If the log disk is full, initiate
744     reintegration.
745
746     Return (0) if we can enqueue the dump immediately, otherwise
747     return (1) to indicate we are blocked on reintegration and
748     control of the thread should be relinquished.
749
750     Caller must hold regionInfo[regionID].mutex
751
752     NON-BLOCKING
753     */
754
755  if (rf_parityLogDebug)
756    printf("[dumping parity log to disk, region %d]\n", regionID);
757  log = raidPtr->regionInfo[regionID].coreLog;
758  RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
759  RF_ASSERT(log->next == NULL);
760
761  /* if reintegration is in progress, must queue work */
762  RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
763  if (raidPtr->regionInfo[regionID].reintInProgress)
764    {
765      /* Can not proceed since this region is currently being reintegrated.
766	 We can not block, so queue remaining work and return */
767      if (rf_parityLogDebug)
768	printf("[region %d waiting on reintegration]\n",regionID);
769      /* XXX not sure about the use of finish - shouldn't this always be "Enqueue"? */
770      if (finish)
771	RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
772      else
773	EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
774      RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
775      return(1);  /* relenquish control of this thread */
776    }
777  RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
778  raidPtr->regionInfo[regionID].coreLog = NULL;
779  if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
780    /* IMPORTANT!! this loop bound assumes region disk holds an integral number of core logs */
781    {
782      /* update disk map for this region */
783      diskCount = raidPtr->regionInfo[regionID].diskCount;
784      for (i = 0; i < raidPtr->numSectorsPerLog; i++)
785	{
786	  raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
787	  raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
788	}
789      log->diskOffset = diskCount;
790      raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
791      FlushLog(raidPtr, log);
792    }
793  else
794    {
795      /* no room for log on disk, send it to disk manager and request reintegration */
796      RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
797      ReintLog(raidPtr, regionID, log);
798    }
799  if (rf_parityLogDebug)
800    printf("[finished dumping parity log to disk, region %d]\n", regionID);
801  return(0);
802}
803
804int rf_ParityLogAppend(
805  RF_ParityLogData_t   *logData,
806  int                   finish,
807  RF_ParityLog_t      **incomingLog,
808  int                   clearReintFlag)
809{
810  int regionID, logItem, itemDone;
811  RF_ParityLogData_t *item;
812  int punt, done = RF_FALSE;
813  RF_ParityLog_t *log;
814  RF_Raid_t *raidPtr;
815  RF_Etimer_t timer;
816  int (*wakeFunc)(RF_DagNode_t *node, int status);
817  void *wakeArg;
818
819  /* Add parity to the appropriate log, one sector at a time.
820     This routine is called is called by dag functions ParityLogUpdateFunc
821     and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
822
823     Parity to be logged is contained in a linked-list (logData).  When
824     this routine returns, every sector in the list will be in one of
825     three places:
826       1) entered into the parity log
827       2) queued, waiting on reintegration
828       3) queued, waiting on a core log
829
830     Blocked work is passed to the ParityLoggingDiskManager for completion.
831     Later, as conditions which required the block are removed, the work
832     reenters this routine with the "finish" parameter set to "RF_TRUE."
833
834     NON-BLOCKING
835     */
836
837  raidPtr = logData->common->raidPtr;
838  /* lock the region for the first item in logData */
839  RF_ASSERT(logData != NULL);
840  regionID = logData->regionID;
841  RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
842  RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
843
844  if (clearReintFlag)
845    {
846      /* Enable flushing for this region.  Holding both locks provides
847	 a synchronization barrier with DumpParityLogToDisk
848	 */
849      RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
850      RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
851      RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
852      raidPtr->regionInfo[regionID].diskCount = 0;
853      raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
854      RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now enabled */
855      RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
856    }
857
858  /* process each item in logData */
859  while (logData)
860    {
861      /* remove an item from logData */
862      item = logData;
863      logData = logData->next;
864      item->next = NULL;
865      item->prev = NULL;
866
867      if (rf_parityLogDebug)
868	printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n",item->regionID,(int)item->diskAddress.raidAddress, (int)item->diskAddress.numSector);
869
870      /* see if we moved to a new region */
871      if (regionID != item->regionID)
872	{
873	  RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
874	  regionID = item->regionID;
875	  RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
876	  RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
877	}
878
879      punt = RF_FALSE;  /* Set to RF_TRUE if work is blocked.  This can happen in one of two ways:
880		          1) no core log (AcquireParityLog)
881			  2) waiting on reintegration (DumpParityLogToDisk)
882			If punt is RF_TRUE, the dataItem was queued, so skip to next item.
883			*/
884
885      /* process item, one sector at a time, until all sectors processed or we punt */
886      if (item->diskAddress.numSector > 0)
887	done = RF_FALSE;
888      else
889	RF_ASSERT(0);
890      while (!punt && !done)
891	{
892	  /* verify that a core log exists for this region */
893	  if (!raidPtr->regionInfo[regionID].coreLog)
894	    {
895	      /* Attempt to acquire a parity log.
896		 If acquisition fails, queue remaining work in data item and move to nextItem.
897		 */
898	      if (incomingLog)
899		if (*incomingLog)
900		  {
901		    RF_ASSERT((*incomingLog)->next == NULL);
902		    raidPtr->regionInfo[regionID].coreLog = *incomingLog;
903		    raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
904		    *incomingLog = NULL;
905		  }
906		else
907		  raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
908	      else
909		raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
910	      /* Note: AcquireParityLog either returns a log or enqueues currentItem */
911	    }
912	  if (!raidPtr->regionInfo[regionID].coreLog)
913	    punt = RF_TRUE; /* failed to find a core log */
914	  else
915	    {
916	      RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
917	      /* verify that the log has room for new entries */
918	      /* if log is full, dump it to disk and grab a new log */
919	      if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog)
920		{
921		  /* log is full, dump it to disk */
922		  if (DumpParityLogToDisk(finish, item))
923		    punt = RF_TRUE; /* dump unsuccessful, blocked on reintegration */
924		  else
925		    {
926		      /* dump was successful */
927		      if (incomingLog)
928			if (*incomingLog)
929			  {
930			    RF_ASSERT((*incomingLog)->next == NULL);
931			    raidPtr->regionInfo[regionID].coreLog = *incomingLog;
932			    raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
933			    *incomingLog = NULL;
934			  }
935			else
936			  raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
937		      else
938			raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
939		      /* if a core log is not available, must queue work and return */
940		      if (!raidPtr->regionInfo[regionID].coreLog)
941			punt = RF_TRUE; /* blocked on log availability */
942		    }
943		}
944	    }
945	  /* if we didn't punt on this item, attempt to add a sector to the core log */
946	  if (!punt)
947	    {
948	      RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
949	      /* at this point, we have a core log with enough room for a sector */
950	      /* copy a sector into the log */
951	      log = raidPtr->regionInfo[regionID].coreLog;
952	      RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
953	      logItem = log->numRecords++;
954	      log->records[logItem].parityAddr = item->diskAddress;
955	      RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
956	      RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
957	      log->records[logItem].parityAddr.numSector = 1;
958	      log->records[logItem].operation = item->common->operation;
959	      bcopy((item->common->bufPtr + (item->bufOffset++ * (1<<item->common->raidPtr->logBytesPerSector))), log->bufPtr + (logItem * (1<<item->common->raidPtr->logBytesPerSector)), (1<<item->common->raidPtr->logBytesPerSector));
960	      item->diskAddress.numSector--;
961	      item->diskAddress.startSector++;
962	      if (item->diskAddress.numSector == 0)
963		done = RF_TRUE;
964	    }
965	}
966
967      if (!punt)
968	{
969	  /* Processed this item completely, decrement count of items
970	     to be processed.
971	     */
972	  RF_ASSERT(item->diskAddress.numSector == 0);
973	  RF_LOCK_MUTEX(item->common->mutex);
974	  item->common->cnt--;
975	  if (item->common->cnt == 0)
976	    itemDone = RF_TRUE;
977	  else
978	    itemDone = RF_FALSE;
979	  RF_UNLOCK_MUTEX(item->common->mutex);
980	  if (itemDone)
981	    {
982	      /* Finished processing all log data for this IO
983		 Return structs to free list and invoke wakeup function.
984		 */
985	      timer = item->common->startTime;  /* grab initial value of timer */
986	      RF_ETIMER_STOP(timer);
987	      RF_ETIMER_EVAL(timer);
988	      item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
989	      if (rf_parityLogDebug)
990		printf("[waking process for region %d]\n", item->regionID);
991	      wakeFunc = item->common->wakeFunc;
992	      wakeArg = item->common->wakeArg;
993	      FreeParityLogCommonData(item->common);
994	      FreeParityLogData(item);
995	      (wakeFunc)(wakeArg, 0);
996	    }
997	  else
998	    FreeParityLogData(item);
999	}
1000    }
1001  RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
1002  if (rf_parityLogDebug)
1003    printf("[exiting ParityLogAppend]\n");
1004  return(0);
1005}
1006
1007
1008void rf_EnableParityLogging(RF_Raid_t *raidPtr)
1009{
1010  int regionID;
1011
1012  for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
1013    RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
1014    raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
1015    RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
1016  }
1017  if (rf_parityLogDebug)
1018    printf("[parity logging enabled]\n");
1019}
1020
1021#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
1022