rf_stripelocks.c revision 1.17
1/*	$NetBSD: rf_stripelocks.c,v 1.17 2003/12/29 03:33:48 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Authors: Mark Holland, Jim Zelenka
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * stripelocks.c -- code to lock stripes for read and write access
31 *
32 * The code distinguishes between read locks and write locks. There can be
33 * as many readers to given stripe as desired. When a write request comes
34 * in, no further readers are allowed to enter, and all subsequent requests
35 * are queued in FIFO order. When a the number of readers goes to zero, the
36 * writer is given the lock. When a writer releases the lock, the list of
37 * queued requests is scanned, and all readersq up to the next writer are
38 * given the lock.
39 *
40 * The lock table size must be one less than a power of two, but HASH_STRIPEID
41 * is the only function that requires this.
42 *
43 * The code now supports "range locks". When you ask to lock a stripe, you
44 * specify a range of addresses in that stripe that you want to lock. When
45 * you acquire the lock, you've locked only this range of addresses, and
46 * other threads can concurrently read/write any non-overlapping portions
47 * of the stripe. The "addresses" that you lock are abstract in that you
48 * can pass in anything you like.  The expectation is that you'll pass in
49 * the range of physical disk offsets of the parity bits you're planning
50 * to update. The idea behind this, of course, is to allow sub-stripe
51 * locking. The implementation is perhaps not the best imaginable; in the
52 * worst case a lock release is O(n^2) in the total number of outstanding
53 * requests to a given stripe.  Note that if you're striping with a
54 * stripe unit size equal to an entire disk (i.e. not striping), there will
55 * be only one stripe and you may spend some significant number of cycles
56 * searching through stripe lock descriptors.
57 */
58
59#include <sys/cdefs.h>
60__KERNEL_RCSID(0, "$NetBSD: rf_stripelocks.c,v 1.17 2003/12/29 03:33:48 oster Exp $");
61
62#include <dev/raidframe/raidframevar.h>
63
64#include "rf_raid.h"
65#include "rf_stripelocks.h"
66#include "rf_alloclist.h"
67#include "rf_debugprint.h"
68#include "rf_general.h"
69#include "rf_driver.h"
70#include "rf_shutdown.h"
71
72#ifdef DEBUG
73
74#define Dprintf1(s,a)         rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
75#define Dprintf2(s,a,b)       rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
76#define Dprintf3(s,a,b,c)     rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
77#define Dprintf4(s,a,b,c,d)   rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
78#define Dprintf5(s,a,b,c,d,e) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
79#define Dprintf6(s,a,b,c,d,e,f) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
80#define Dprintf7(s,a,b,c,d,e,f,g) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
81#define Dprintf8(s,a,b,c,d,e,f,g,h) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
82
83#else /* DEBUG */
84
85#define Dprintf1(s,a) {}
86#define Dprintf2(s,a,b) {}
87#define Dprintf3(s,a,b,c) {}
88#define Dprintf4(s,a,b,c,d) {}
89#define Dprintf5(s,a,b,c,d,e) {}
90#define Dprintf6(s,a,b,c,d,e,f) {}
91#define Dprintf7(s,a,b,c,d,e,f,g) {}
92#define Dprintf8(s,a,b,c,d,e,f,g,h) {}
93
94#endif /* DEBUG */
95
96#define FLUSH
97
98#define HASH_STRIPEID(_sid_)  ( (_sid_) & (rf_lockTableSize-1) )
99
100static void AddToWaitersQueue(RF_StripeLockDesc_t * lockDesc, RF_LockReqDesc_t * lockReqDesc);
101static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID);
102static void FreeStripeLockDesc(RF_StripeLockDesc_t * p);
103static RF_LockTableEntry_t *rf_MakeLockTable(void);
104#if RF_DEBUG_STRIPELOCK
105static void PrintLockedStripes(RF_LockTableEntry_t * lockTable);
106#endif
107
108/* determines if two ranges overlap.  always yields false if either start value is negative  */
109#define SINGLE_RANGE_OVERLAP(_strt1, _stop1, _strt2, _stop2)                                     \
110  ( (_strt1 >= 0) && (_strt2 >= 0) && (RF_MAX(_strt1, _strt2) <= RF_MIN(_stop1, _stop2)) )
111
112/* determines if any of the ranges specified in the two lock descriptors overlap each other */
113#define RANGE_OVERLAP(_cand, _pred)                                                  \
114  ( SINGLE_RANGE_OVERLAP((_cand)->start,  (_cand)->stop,  (_pred)->start,  (_pred)->stop ) ||    \
115    SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start,  (_pred)->stop ) ||    \
116    SINGLE_RANGE_OVERLAP((_cand)->start,  (_cand)->stop,  (_pred)->start2, (_pred)->stop2) ||    \
117    SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start2, (_pred)->stop2) )
118
119/* Determines if a candidate lock request conflicts with a predecessor lock req.
120 * Note that the arguments are not interchangeable.
121 * The rules are:
122 *      a candidate read conflicts with a predecessor write if any ranges overlap
123 *      a candidate write conflicts with a predecessor read if any ranges overlap
124 *      a candidate write conflicts with a predecessor write if any ranges overlap
125 */
126#define STRIPELOCK_CONFLICT(_cand, _pred)                                        \
127  RANGE_OVERLAP((_cand), (_pred)) &&                                        \
128  ( ( (((_cand)->type == RF_IO_TYPE_READ) && ((_pred)->type == RF_IO_TYPE_WRITE)) ||                      \
129      (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_READ)) ||                      \
130      (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_WRITE))                         \
131    )                                                                            \
132  )
133
134static struct pool rf_stripelock_pool;
135#define RF_MAX_FREE_STRIPELOCK 128
136#define RF_STRIPELOCK_INC        8
137#define RF_STRIPELOCK_INITIAL   32
138
139static void rf_ShutdownStripeLocks(RF_LockTableEntry_t * lockTable);
140static void rf_ShutdownStripeLockFreeList(void *);
141static void rf_RaidShutdownStripeLocks(void *);
142
143static void
144rf_ShutdownStripeLockFreeList(ignored)
145	void   *ignored;
146{
147	pool_destroy(&rf_stripelock_pool);
148}
149
150int
151rf_ConfigureStripeLockFreeList(listp)
152	RF_ShutdownList_t **listp;
153{
154	unsigned mask;
155	int     rc;
156
157	pool_init(&rf_stripelock_pool, sizeof(RF_StripeLockDesc_t),
158		  0, 0, 0, "rf_stripelock_pl", NULL);
159	pool_sethiwat(&rf_stripelock_pool, RF_MAX_FREE_STRIPELOCK);
160	pool_prime(&rf_stripelock_pool, RF_STRIPELOCK_INITIAL);
161
162	rc = rf_ShutdownCreate(listp, rf_ShutdownStripeLockFreeList, NULL);
163	if (rc) {
164		rf_print_unable_to_add_shutdown(__FILE__, __LINE__, rc);
165		rf_ShutdownStripeLockFreeList(NULL);
166		return (rc);
167	}
168
169	for (mask = 0x1; mask; mask <<= 1)
170		if (rf_lockTableSize == mask)
171			break;
172	if (!mask) {
173		printf("[WARNING:  lock table size must be a power of two.  Setting to %d.]\n", RF_DEFAULT_LOCK_TABLE_SIZE);
174		rf_lockTableSize = RF_DEFAULT_LOCK_TABLE_SIZE;
175	}
176	return (0);
177}
178
179static RF_LockTableEntry_t *
180rf_MakeLockTable()
181{
182	RF_LockTableEntry_t *lockTable;
183	int     i, rc;
184
185	RF_Malloc(lockTable,
186		  ((int) rf_lockTableSize) * sizeof(RF_LockTableEntry_t),
187		  (RF_LockTableEntry_t *));
188	if (lockTable == NULL)
189		return (NULL);
190	for (i = 0; i < rf_lockTableSize; i++) {
191		rc = rf_mutex_init(&lockTable[i].mutex);
192		if (rc) {
193			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
194			/* XXX clean up other mutexes */
195			return (NULL);
196		}
197	}
198	return (lockTable);
199}
200
201static void
202rf_ShutdownStripeLocks(RF_LockTableEntry_t * lockTable)
203{
204	int     i;
205
206#if RF_DEBUG_STRIPELOCK
207	if (rf_stripeLockDebug) {
208		PrintLockedStripes(lockTable);
209	}
210#endif
211	for (i = 0; i < rf_lockTableSize; i++) {
212		rf_mutex_destroy(&lockTable[i].mutex);
213	}
214	RF_Free(lockTable, rf_lockTableSize * sizeof(RF_LockTableEntry_t));
215}
216
217static void
218rf_RaidShutdownStripeLocks(arg)
219	void   *arg;
220{
221	RF_Raid_t *raidPtr = (RF_Raid_t *) arg;
222	rf_ShutdownStripeLocks(raidPtr->lockTable);
223}
224
225int
226rf_ConfigureStripeLocks(
227    RF_ShutdownList_t ** listp,
228    RF_Raid_t * raidPtr,
229    RF_Config_t * cfgPtr)
230{
231	int     rc;
232
233	raidPtr->lockTable = rf_MakeLockTable();
234	if (raidPtr->lockTable == NULL)
235		return (ENOMEM);
236	rc = rf_ShutdownCreate(listp, rf_RaidShutdownStripeLocks, raidPtr);
237	if (rc) {
238		rf_print_unable_to_add_shutdown(__FILE__, __LINE__, rc);
239		rf_ShutdownStripeLocks(raidPtr->lockTable);
240		return (rc);
241	}
242	return (0);
243}
244/* returns 0 if you've got the lock, and non-zero if you have to wait.
245 * if and only if you have to wait, we'll cause cbFunc to get invoked
246 * with cbArg when you are granted the lock.  We store a tag in *releaseTag
247 * that you need to give back to us when you release the lock.
248 */
249int
250rf_AcquireStripeLock(
251    RF_LockTableEntry_t * lockTable,
252    RF_StripeNum_t stripeID,
253    RF_LockReqDesc_t * lockReqDesc)
254{
255	RF_StripeLockDesc_t *lockDesc;
256	RF_LockReqDesc_t *p;
257#if defined(DEBUG) && (RF_DEBUG_STRIPELOCK > 0)
258	int     tid = 0;
259#endif
260	int     hashval = HASH_STRIPEID(stripeID);
261	int     retcode = 0;
262
263	RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
264
265#if RF_DEBUG_STRIPELOCK
266	if (rf_stripeLockDebug) {
267		if (stripeID == -1) {
268			Dprintf1("[%d] Lock acquisition supressed (stripeID == -1)\n", tid);
269		} else {
270			Dprintf8("[%d] Trying to acquire stripe lock table 0x%lx SID %ld type %c range %ld-%ld, range2 %ld-%ld hashval %d\n",
271			    tid, (unsigned long) lockTable, stripeID, lockReqDesc->type, lockReqDesc->start,
272			    lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
273			Dprintf3("[%d] lock %ld hashval %d\n", tid, stripeID, hashval);
274			FLUSH;
275		}
276	}
277#endif
278	if (stripeID == -1)
279		return (0);
280	lockReqDesc->next = NULL;	/* just to be sure */
281
282	RF_LOCK_MUTEX(lockTable[hashval].mutex);
283	for (lockDesc = lockTable[hashval].descList; lockDesc; lockDesc = lockDesc->next) {
284		if (lockDesc->stripeID == stripeID)
285			break;
286	}
287
288	if (!lockDesc) {	/* no entry in table => no one reading or
289				 * writing */
290		lockDesc = AllocStripeLockDesc(stripeID);
291		lockDesc->next = lockTable[hashval].descList;
292		lockTable[hashval].descList = lockDesc;
293		if (lockReqDesc->type == RF_IO_TYPE_WRITE)
294			lockDesc->nWriters++;
295		lockDesc->granted = lockReqDesc;
296#if RF_DEBUG_STRIPELOCK
297		if (rf_stripeLockDebug) {
298			Dprintf7("[%d] no one waiting: lock %ld %c %ld-%ld %ld-%ld granted\n",
299			    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
300			FLUSH;
301		}
302#endif
303	} else {
304
305		if (lockReqDesc->type == RF_IO_TYPE_WRITE)
306			lockDesc->nWriters++;
307
308		if (lockDesc->nWriters == 0) {	/* no need to search any lists
309						 * if there are no writers
310						 * anywhere */
311			lockReqDesc->next = lockDesc->granted;
312			lockDesc->granted = lockReqDesc;
313#if RF_DEBUG_STRIPELOCK
314			if (rf_stripeLockDebug) {
315				Dprintf7("[%d] no writers: lock %ld %c %ld-%ld %ld-%ld granted\n",
316				    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
317				FLUSH;
318			}
319#endif
320		} else {
321
322			/* search the granted & waiting lists for a conflict.
323			 * stop searching as soon as we find one */
324			retcode = 0;
325			for (p = lockDesc->granted; p; p = p->next)
326				if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {
327					retcode = 1;
328					break;
329				}
330			if (!retcode)
331				for (p = lockDesc->waitersH; p; p = p->next)
332					if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {
333						retcode = 2;
334						break;
335					}
336			if (!retcode) {
337				lockReqDesc->next = lockDesc->granted;	/* no conflicts found =>
338									 * grant lock */
339				lockDesc->granted = lockReqDesc;
340#if RF_DEBUG_STRIPELOCK
341				if (rf_stripeLockDebug) {
342					Dprintf7("[%d] no conflicts: lock %ld %c %ld-%ld %ld-%ld granted\n",
343					    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop,
344					    lockReqDesc->start2, lockReqDesc->stop2);
345					FLUSH;
346				}
347#endif
348			} else {
349#if RF_DEBUG_STRIPELOCK
350				if (rf_stripeLockDebug) {
351					Dprintf6("[%d] conflict: lock %ld %c %ld-%ld hashval=%d not granted\n",
352					    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop,
353					    hashval);
354					Dprintf3("[%d] lock %ld retcode=%d\n", tid, stripeID, retcode);
355					FLUSH;
356				}
357#endif
358				AddToWaitersQueue(lockDesc, lockReqDesc);
359				/* conflict => the current access must wait */
360			}
361		}
362	}
363
364	RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
365	return (retcode);
366}
367
368void
369rf_ReleaseStripeLock(
370    RF_LockTableEntry_t * lockTable,
371    RF_StripeNum_t stripeID,
372    RF_LockReqDesc_t * lockReqDesc)
373{
374	RF_StripeLockDesc_t *lockDesc, *ld_t;
375	RF_LockReqDesc_t *lr, *lr_t, *callbacklist, *t;
376#if defined(DEBUG) && (RF_DEBUG_STRIPELOCK > 0)
377	int     tid = 0;
378#endif
379	int     hashval = HASH_STRIPEID(stripeID);
380	int     release_it, consider_it;
381	RF_LockReqDesc_t *candidate, *candidate_t, *predecessor;
382
383	RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
384
385#if RF_DEBUG_STRIPELOCK
386	if (rf_stripeLockDebug) {
387		if (stripeID == -1) {
388			Dprintf1("[%d] Lock release supressed (stripeID == -1)\n", tid);
389		} else {
390			Dprintf8("[%d] Releasing stripe lock on stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
391			    tid, stripeID, lockReqDesc->type, lockReqDesc->start, lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2, lockTable);
392			FLUSH;
393		}
394	}
395#endif
396	if (stripeID == -1)
397		return;
398
399	RF_LOCK_MUTEX(lockTable[hashval].mutex);
400
401	/* find the stripe lock descriptor */
402	for (ld_t = NULL, lockDesc = lockTable[hashval].descList; lockDesc; ld_t = lockDesc, lockDesc = lockDesc->next) {
403		if (lockDesc->stripeID == stripeID)
404			break;
405	}
406	RF_ASSERT(lockDesc);	/* major error to release a lock that doesn't
407				 * exist */
408
409	/* find the stripe lock request descriptor & delete it from the list */
410	for (lr_t = NULL, lr = lockDesc->granted; lr; lr_t = lr, lr = lr->next)
411		if (lr == lockReqDesc)
412			break;
413
414	RF_ASSERT(lr && (lr == lockReqDesc));	/* major error to release a
415						 * lock that hasn't been
416						 * granted */
417	if (lr_t)
418		lr_t->next = lr->next;
419	else {
420		RF_ASSERT(lr == lockDesc->granted);
421		lockDesc->granted = lr->next;
422	}
423	lr->next = NULL;
424
425	if (lockReqDesc->type == RF_IO_TYPE_WRITE)
426		lockDesc->nWriters--;
427
428	/* search through the waiters list to see if anyone needs to be woken
429	 * up. for each such descriptor in the wait list, we check it against
430	 * everything granted and against everything _in front_ of it in the
431	 * waiters queue.  If it conflicts with none of these, we release it.
432	 *
433	 * DON'T TOUCH THE TEMPLINK POINTER OF ANYTHING IN THE GRANTED LIST HERE.
434	 * This will roach the case where the callback tries to acquire a new
435	 * lock in the same stripe.  There are some asserts to try and detect
436	 * this.
437	 *
438	 * We apply 2 performance optimizations: (1) if releasing this lock
439	 * results in no more writers to this stripe, we just release
440	 * everybody waiting, since we place no restrictions on the number of
441	 * concurrent reads. (2) we consider as candidates for wakeup only
442	 * those waiters that have a range overlap with either the descriptor
443	 * being woken up or with something in the callbacklist (i.e.
444	 * something we've just now woken up). This allows us to avoid the
445	 * long evaluation for some descriptors. */
446
447	callbacklist = NULL;
448	if (lockDesc->nWriters == 0) {	/* performance tweak (1) */
449		while (lockDesc->waitersH) {
450
451			lr = lockDesc->waitersH;	/* delete from waiters
452							 * list */
453			lockDesc->waitersH = lr->next;
454
455			RF_ASSERT(lr->type == RF_IO_TYPE_READ);
456
457			lr->next = lockDesc->granted;	/* add to granted list */
458			lockDesc->granted = lr;
459
460			RF_ASSERT(!lr->templink);
461			lr->templink = callbacklist;	/* put on callback list
462							 * so that we'll invoke
463							 * callback below */
464			callbacklist = lr;
465#if RF_DEBUG_STRIPELOCK
466			if (rf_stripeLockDebug) {
467				Dprintf8("[%d] No writers: granting lock stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
468				    tid, stripeID, lr->type, lr->start, lr->stop, lr->start2, lr->stop2, (unsigned long) lockTable);
469				FLUSH;
470			}
471#endif
472		}
473		lockDesc->waitersT = NULL;	/* we've purged the whole
474						 * waiters list */
475
476	} else
477		for (candidate_t = NULL, candidate = lockDesc->waitersH; candidate;) {
478
479			/* performance tweak (2) */
480			consider_it = 0;
481			if (RANGE_OVERLAP(lockReqDesc, candidate))
482				consider_it = 1;
483			else
484				for (t = callbacklist; t; t = t->templink)
485					if (RANGE_OVERLAP(t, candidate)) {
486						consider_it = 1;
487						break;
488					}
489			if (!consider_it) {
490#if RF_DEBUG_STRIPELOCK
491				if (rf_stripeLockDebug) {
492					Dprintf8("[%d] No overlap: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
493					    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
494					    (unsigned long) lockTable);
495					FLUSH;
496				}
497#endif
498				candidate_t = candidate;
499				candidate = candidate->next;
500				continue;
501			}
502			/* we have a candidate for release.  check to make
503			 * sure it is not blocked by any granted locks */
504			release_it = 1;
505			for (predecessor = lockDesc->granted; predecessor; predecessor = predecessor->next) {
506				if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
507#if RF_DEBUG_STRIPELOCK
508					if (rf_stripeLockDebug) {
509						Dprintf8("[%d] Conflicts with granted lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
510						    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
511						    (unsigned long) lockTable);
512						FLUSH;
513					}
514#endif
515					release_it = 0;
516					break;
517				}
518			}
519
520			/* now check to see if the candidate is blocked by any
521			 * waiters that occur before it it the wait queue */
522			if (release_it)
523				for (predecessor = lockDesc->waitersH; predecessor != candidate; predecessor = predecessor->next) {
524					if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
525#if RF_DEBUG_STRIPELOCK
526						if (rf_stripeLockDebug) {
527							Dprintf8("[%d] Conflicts with waiting lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
528							    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
529							    (unsigned long) lockTable);
530							FLUSH;
531						}
532#endif
533						release_it = 0;
534						break;
535					}
536				}
537
538			/* release it if indicated */
539			if (release_it) {
540#if RF_DEBUG_STRIPELOCK
541				if (rf_stripeLockDebug) {
542					Dprintf8("[%d] Granting lock to candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
543					    tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
544					    (unsigned long) lockTable);
545					FLUSH;
546				}
547#endif
548				if (candidate_t) {
549					candidate_t->next = candidate->next;
550					if (lockDesc->waitersT == candidate)
551						lockDesc->waitersT = candidate_t;	/* cannot be waitersH since candidate_t is not NULL */
552				} else {
553					RF_ASSERT(candidate == lockDesc->waitersH);
554					lockDesc->waitersH = lockDesc->waitersH->next;
555					if (!lockDesc->waitersH)
556						lockDesc->waitersT = NULL;
557				}
558				candidate->next = lockDesc->granted;	/* move it to the
559									 * granted list */
560				lockDesc->granted = candidate;
561
562				RF_ASSERT(!candidate->templink);
563				candidate->templink = callbacklist;	/* put it on the list of
564									 * things to be called
565									 * after we release the
566									 * mutex */
567				callbacklist = candidate;
568
569				if (!candidate_t)
570					candidate = lockDesc->waitersH;
571				else
572					candidate = candidate_t->next;	/* continue with the
573									 * rest of the list */
574			} else {
575				candidate_t = candidate;
576				candidate = candidate->next;	/* continue with the
577								 * rest of the list */
578			}
579		}
580
581	/* delete the descriptor if no one is waiting or active */
582	if (!lockDesc->granted && !lockDesc->waitersH) {
583		RF_ASSERT(lockDesc->nWriters == 0);
584#if RF_DEBUG_STRIPELOCK
585		if (rf_stripeLockDebug) {
586			Dprintf3("[%d] Last lock released (table 0x%lx): deleting desc for stripeID %ld\n", tid, (unsigned long) lockTable, stripeID);
587			FLUSH;
588		}
589#endif
590		if (ld_t)
591			ld_t->next = lockDesc->next;
592		else {
593			RF_ASSERT(lockDesc == lockTable[hashval].descList);
594			lockTable[hashval].descList = lockDesc->next;
595		}
596		FreeStripeLockDesc(lockDesc);
597		lockDesc = NULL;/* only for the ASSERT below */
598	}
599	RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
600
601	/* now that we've unlocked the mutex, invoke the callback on all the
602	 * descriptors in the list */
603	RF_ASSERT(!((callbacklist) && (!lockDesc)));	/* if we deleted the
604							 * descriptor, we should
605							 * have no callbacks to
606							 * do */
607	for (candidate = callbacklist; candidate;) {
608		t = candidate;
609		candidate = candidate->templink;
610		t->templink = NULL;
611		(t->cbFunc) (t->cbArg);
612	}
613}
614/* must have the indicated lock table mutex upon entry */
615static void
616AddToWaitersQueue(
617    RF_StripeLockDesc_t * lockDesc,
618    RF_LockReqDesc_t * lockReqDesc)
619{
620	if (!lockDesc->waitersH) {
621		lockDesc->waitersH = lockDesc->waitersT = lockReqDesc;
622	} else {
623		lockDesc->waitersT->next = lockReqDesc;
624		lockDesc->waitersT = lockReqDesc;
625	}
626}
627
628static RF_StripeLockDesc_t *
629AllocStripeLockDesc(RF_StripeNum_t stripeID)
630{
631	RF_StripeLockDesc_t *p;
632
633	p = pool_get(&rf_stripelock_pool, PR_WAITOK);
634	if (p) {
635		p->stripeID = stripeID;
636		p->granted = NULL;
637		p->waitersH = NULL;
638		p->waitersT = NULL;
639		p->nWriters = 0;
640		p->next = NULL;
641	}
642	return (p);
643}
644
645static void
646FreeStripeLockDesc(RF_StripeLockDesc_t * p)
647{
648	pool_put(&rf_stripelock_pool, p);
649}
650
651#if RF_DEBUG_STRIPELOCK
652static void
653PrintLockedStripes(lockTable)
654	RF_LockTableEntry_t *lockTable;
655{
656	int     i, j, foundone = 0, did;
657	RF_StripeLockDesc_t *p;
658	RF_LockReqDesc_t *q;
659
660	RF_LOCK_MUTEX(rf_printf_mutex);
661	printf("Locked stripes:\n");
662	for (i = 0; i < rf_lockTableSize; i++)
663		if (lockTable[i].descList) {
664			foundone = 1;
665			for (p = lockTable[i].descList; p; p = p->next) {
666				printf("Stripe ID 0x%lx (%d) nWriters %d\n",
667				    (long) p->stripeID, (int) p->stripeID, p->nWriters);
668
669				if (!(p->granted))
670					printf("Granted: (none)\n");
671				else
672					printf("Granted:\n");
673				for (did = 1, j = 0, q = p->granted; q; j++, q = q->next) {
674					printf("  %c(%ld-%ld", q->type, (long) q->start, (long) q->stop);
675					if (q->start2 != -1)
676						printf(",%ld-%ld) ", (long) q->start2,
677						    (long) q->stop2);
678					else
679						printf(") ");
680					if (j && !(j % 4)) {
681						printf("\n");
682						did = 1;
683					} else
684						did = 0;
685				}
686				if (!did)
687					printf("\n");
688
689				if (!(p->waitersH))
690					printf("Waiting: (none)\n");
691				else
692					printf("Waiting:\n");
693				for (did = 1, j = 0, q = p->waitersH; q; j++, q = q->next) {
694					printf("%c(%ld-%ld", q->type, (long) q->start, (long) q->stop);
695					if (q->start2 != -1)
696						printf(",%ld-%ld) ", (long) q->start2, (long) q->stop2);
697					else
698						printf(") ");
699					if (j && !(j % 4)) {
700						printf("\n         ");
701						did = 1;
702					} else
703						did = 0;
704				}
705				if (!did)
706					printf("\n");
707			}
708		}
709	if (!foundone)
710		printf("(none)\n");
711	else
712		printf("\n");
713	RF_UNLOCK_MUTEX(rf_printf_mutex);
714}
715#endif
716