kern_rwlock.c revision 256281
1/*-
2 * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the author nor the names of any co-contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/*
31 * Machine independent bits of reader/writer lock implementation.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/kern/kern_rwlock.c 255788 2013-09-22 14:09:07Z davide $");
36
37#include "opt_ddb.h"
38#include "opt_hwpmc_hooks.h"
39#include "opt_kdtrace.h"
40#include "opt_no_adaptive_rwlocks.h"
41
42#include <sys/param.h>
43#include <sys/kdb.h>
44#include <sys/ktr.h>
45#include <sys/kernel.h>
46#include <sys/lock.h>
47#include <sys/mutex.h>
48#include <sys/proc.h>
49#include <sys/rwlock.h>
50#include <sys/sysctl.h>
51#include <sys/systm.h>
52#include <sys/turnstile.h>
53
54#include <machine/cpu.h>
55
56#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
57#define	ADAPTIVE_RWLOCKS
58#endif
59
60#ifdef HWPMC_HOOKS
61#include <sys/pmckern.h>
62PMC_SOFT_DECLARE( , , lock, failed);
63#endif
64
65/*
66 * Return the rwlock address when the lock cookie address is provided.
67 * This functionality assumes that struct rwlock has a member named rw_lock.
68 */
69#define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
70
71#ifdef ADAPTIVE_RWLOCKS
72static int rowner_retries = 10;
73static int rowner_loops = 10000;
74static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
75    "rwlock debugging");
76SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
77SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
78#endif
79
80#ifdef DDB
81#include <ddb/ddb.h>
82
83static void	db_show_rwlock(const struct lock_object *lock);
84#endif
85static void	assert_rw(const struct lock_object *lock, int what);
86static void	lock_rw(struct lock_object *lock, uintptr_t how);
87#ifdef KDTRACE_HOOKS
88static int	owner_rw(const struct lock_object *lock, struct thread **owner);
89#endif
90static uintptr_t unlock_rw(struct lock_object *lock);
91
92struct lock_class lock_class_rw = {
93	.lc_name = "rw",
94	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
95	.lc_assert = assert_rw,
96#ifdef DDB
97	.lc_ddb_show = db_show_rwlock,
98#endif
99	.lc_lock = lock_rw,
100	.lc_unlock = unlock_rw,
101#ifdef KDTRACE_HOOKS
102	.lc_owner = owner_rw,
103#endif
104};
105
106/*
107 * Return a pointer to the owning thread if the lock is write-locked or
108 * NULL if the lock is unlocked or read-locked.
109 */
110#define	rw_wowner(rw)							\
111	((rw)->rw_lock & RW_LOCK_READ ? NULL :				\
112	    (struct thread *)RW_OWNER((rw)->rw_lock))
113
114/*
115 * Return whether the write owner is recursed.  Write ownership is not
116 * assured here and should be checked beforehand by the caller.
117 */
118#define	rw_recursed(rw)		((rw)->rw_recurse != 0)
119
120/*
121 * Return true if curthread holds the lock.
122 */
123#define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)
124
125/*
126 * Return a pointer to the thread that owns this lock and should receive
127 * any priority lent by threads that block on this lock.  Currently this
128 * is identical to rw_wowner().
129 */
130#define	rw_owner(rw)		rw_wowner(rw)
131
132#ifndef INVARIANTS
133#define	__rw_assert(c, what, file, line)
134#endif
135
136void
137assert_rw(const struct lock_object *lock, int what)
138{
139
140	rw_assert((const struct rwlock *)lock, what);
141}
142
143void
144lock_rw(struct lock_object *lock, uintptr_t how)
145{
146	struct rwlock *rw;
147
148	rw = (struct rwlock *)lock;
149	if (how)
150		rw_rlock(rw);
151	else
152		rw_wlock(rw);
153}
154
155uintptr_t
156unlock_rw(struct lock_object *lock)
157{
158	struct rwlock *rw;
159
160	rw = (struct rwlock *)lock;
161	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
162	if (rw->rw_lock & RW_LOCK_READ) {
163		rw_runlock(rw);
164		return (1);
165	} else {
166		rw_wunlock(rw);
167		return (0);
168	}
169}
170
171#ifdef KDTRACE_HOOKS
172int
173owner_rw(const struct lock_object *lock, struct thread **owner)
174{
175	const struct rwlock *rw = (const struct rwlock *)lock;
176	uintptr_t x = rw->rw_lock;
177
178	*owner = rw_wowner(rw);
179	return ((x & RW_LOCK_READ) != 0 ?  (RW_READERS(x) != 0) :
180	    (*owner != NULL));
181}
182#endif
183
184void
185_rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
186{
187	struct rwlock *rw;
188	int flags;
189
190	rw = rwlock2rw(c);
191
192	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
193	    RW_RECURSE)) == 0);
194	ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
195	    ("%s: rw_lock not aligned for %s: %p", __func__, name,
196	    &rw->rw_lock));
197
198	flags = LO_UPGRADABLE;
199	if (opts & RW_DUPOK)
200		flags |= LO_DUPOK;
201	if (opts & RW_NOPROFILE)
202		flags |= LO_NOPROFILE;
203	if (!(opts & RW_NOWITNESS))
204		flags |= LO_WITNESS;
205	if (opts & RW_RECURSE)
206		flags |= LO_RECURSABLE;
207	if (opts & RW_QUIET)
208		flags |= LO_QUIET;
209
210	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
211	rw->rw_lock = RW_UNLOCKED;
212	rw->rw_recurse = 0;
213}
214
215void
216_rw_destroy(volatile uintptr_t *c)
217{
218	struct rwlock *rw;
219
220	rw = rwlock2rw(c);
221
222	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
223	KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
224	rw->rw_lock = RW_DESTROYED;
225	lock_destroy(&rw->lock_object);
226}
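
/*
 * Illustrative sketch (comment only, not compiled): the typical life
 * cycle of an rwlock as seen by a consumer of the rwlock(9) wrappers
 * that end up in _rw_init_flags() and _rw_destroy() above.  Updates
 * run under the exclusive (write) lock, lookups under the shared
 * (read) lock.  The "foo" names are hypothetical.
 *
 *	static struct rwlock foo_lock;
 *	static int foo_count;
 *
 *	rw_init(&foo_lock, "foo count");
 *
 *	rw_wlock(&foo_lock);
 *	foo_count++;
 *	rw_wunlock(&foo_lock);
 *
 *	rw_rlock(&foo_lock);
 *	printf("foo_count is %d\n", foo_count);
 *	rw_runlock(&foo_lock);
 *
 *	rw_destroy(&foo_lock);
 */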
227
228void
229rw_sysinit(void *arg)
230{
231	struct rw_args *args = arg;
232
233	rw_init((struct rwlock *)args->ra_rw, args->ra_desc);
234}
235
236void
237rw_sysinit_flags(void *arg)
238{
239	struct rw_args_flags *args = arg;
240
241	rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
242	    args->ra_flags);
243}
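
/*
 * Illustrative sketch (comment only, not compiled): rw_sysinit() and
 * rw_sysinit_flags() are normally reached through the RW_SYSINIT() and
 * RW_SYSINIT_FLAGS() macros in <sys/rwlock.h>, which register the
 * initialization with SYSINIT(9) so the lock is ready at boot.  The
 * "foo" names are hypothetical.
 *
 *	static struct rwlock foo_lock;
 *	RW_SYSINIT(foo_lock_init, &foo_lock, "foo lock");
 */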
244
245int
246_rw_wowned(const volatile uintptr_t *c)
247{
248
249	return (rw_wowner(rwlock2rw(c)) == curthread);
250}
251
252void
253_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
254{
255	struct rwlock *rw;
256
257	if (SCHEDULER_STOPPED())
258		return;
259
260	rw = rwlock2rw(c);
261
262	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
263	    ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
264	    curthread, rw->lock_object.lo_name, file, line));
265	KASSERT(rw->rw_lock != RW_DESTROYED,
266	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
267	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
268	    line, NULL);
269	__rw_wlock(rw, curthread, file, line);
270	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
271	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
272	curthread->td_locks++;
273}
274
275int
276__rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
277{
278	struct rwlock *rw;
279	int rval;
280
281	if (SCHEDULER_STOPPED())
282		return (1);
283
284	rw = rwlock2rw(c);
285
286	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
287	    ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
288	    curthread, rw->lock_object.lo_name, file, line));
289	KASSERT(rw->rw_lock != RW_DESTROYED,
290	    ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
291
292	if (rw_wlocked(rw) &&
293	    (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) {
294		rw->rw_recurse++;
295		rval = 1;
296	} else
297		rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED,
298		    (uintptr_t)curthread);
299
300	LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
301	if (rval) {
302		WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
303		    file, line);
304		curthread->td_locks++;
305	}
306	return (rval);
307}
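
/*
 * Illustrative sketch (comment only, not compiled): rw_try_wlock() is
 * useful when blocking on the lock is undesirable, e.g. an
 * opportunistic fast path that simply defers its work when the lock
 * is busy.  The "foo" names are hypothetical.
 *
 *	if (rw_try_wlock(&foo_lock)) {
 *		foo_flush();
 *		rw_wunlock(&foo_lock);
 *	} else
 *		foo_schedule_flush();
 */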
308
309void
310_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
311{
312	struct rwlock *rw;
313
314	if (SCHEDULER_STOPPED())
315		return;
316
317	rw = rwlock2rw(c);
318
319	KASSERT(rw->rw_lock != RW_DESTROYED,
320	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
321	__rw_assert(c, RA_WLOCKED, file, line);
322	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
323	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
324	    line);
325	if (!rw_recursed(rw))
326		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE, rw);
327	__rw_wunlock(rw, curthread, file, line);
328	curthread->td_locks--;
329}
330/*
331 * Determine whether a new reader can acquire the lock.  Succeed if the
332 * thread already owns a read lock and the lock is in read mode, which
333 * prevents deadlock from reader recursion.  Also succeed if the lock is
334 * read-locked or unlocked and has no write waiters or spinners.  Failing
335 * in all other cases gives writers priority over readers.
336 */
337#define	RW_CAN_READ(_rw)						\
338    ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) &	\
339    (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) ==	\
340    RW_LOCK_READ)
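
/*
 * Worked example (assuming the lock word encoding in <sys/rwlock.h>):
 * with v = RW_READERS_LOCK(2), i.e. two readers and no waiter bits set,
 * RW_CAN_READ(v) is true for every thread.  With the same word plus
 * RW_LOCK_WRITE_WAITERS set, it is true only for a thread whose
 * td_rw_rlocks count is non-zero (it already holds some read lock), so
 * that reader recursion cannot deadlock against the waiting writer; a
 * fresh reader fails the check and defers to the writer.
 */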
341
342void
343__rw_rlock(volatile uintptr_t *c, const char *file, int line)
344{
345	struct rwlock *rw;
346	struct turnstile *ts;
347#ifdef ADAPTIVE_RWLOCKS
348	volatile struct thread *owner;
349	int spintries = 0;
350	int i;
351#endif
352#ifdef LOCK_PROFILING
353	uint64_t waittime = 0;
354	int contested = 0;
355#endif
356	uintptr_t v;
357#ifdef KDTRACE_HOOKS
358	uint64_t spin_cnt = 0;
359	uint64_t sleep_cnt = 0;
360	int64_t sleep_time = 0;
361#endif
362
363	if (SCHEDULER_STOPPED())
364		return;
365
366	rw = rwlock2rw(c);
367
368	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
369	    ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
370	    curthread, rw->lock_object.lo_name, file, line));
371	KASSERT(rw->rw_lock != RW_DESTROYED,
372	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
373	KASSERT(rw_wowner(rw) != curthread,
374	    ("rw_rlock: wlock already held for %s @ %s:%d",
375	    rw->lock_object.lo_name, file, line));
376	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
377
378	for (;;) {
379#ifdef KDTRACE_HOOKS
380		spin_cnt++;
381#endif
382		/*
383		 * Handle the easy case.  If no other thread has a write
384		 * lock, then try to bump up the count of read locks.  Note
385		 * that we have to preserve the current state of the
386		 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
387		 * read lock, then rw_lock must have changed, so restart
388		 * the loop.  Note that this handles the case of a
389		 * completely unlocked rwlock since such a lock is encoded
390		 * as a read lock with no waiters.
391		 */
392		v = rw->rw_lock;
393		if (RW_CAN_READ(v)) {
394			/*
395			 * The RW_LOCK_READ_WAITERS flag should only be set
396			 * if the lock has been unlocked and write waiters
397			 * were present.
398			 */
399			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v,
400			    v + RW_ONE_READER)) {
401				if (LOCK_LOG_TEST(&rw->lock_object, 0))
402					CTR4(KTR_LOCK,
403					    "%s: %p succeed %p -> %p", __func__,
404					    rw, (void *)v,
405					    (void *)(v + RW_ONE_READER));
406				break;
407			}
408			continue;
409		}
410#ifdef HWPMC_HOOKS
411		PMC_SOFT_CALL( , , lock, failed);
412#endif
413		lock_profile_obtain_lock_failed(&rw->lock_object,
414		    &contested, &waittime);
415
416#ifdef ADAPTIVE_RWLOCKS
417		/*
418		 * If the owner is running on another CPU, spin until
419		 * the owner stops running or the state of the lock
420		 * changes.
421		 */
422		if ((v & RW_LOCK_READ) == 0) {
423			owner = (struct thread *)RW_OWNER(v);
424			if (TD_IS_RUNNING(owner)) {
425				if (LOCK_LOG_TEST(&rw->lock_object, 0))
426					CTR3(KTR_LOCK,
427					    "%s: spinning on %p held by %p",
428					    __func__, rw, owner);
429				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
430				    owner && TD_IS_RUNNING(owner)) {
431					cpu_spinwait();
432#ifdef KDTRACE_HOOKS
433					spin_cnt++;
434#endif
435				}
436				continue;
437			}
438		} else if (spintries < rowner_retries) {
439			spintries++;
440			for (i = 0; i < rowner_loops; i++) {
441				v = rw->rw_lock;
442				if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v))
443					break;
444				cpu_spinwait();
445			}
446			if (i != rowner_loops)
447				continue;
448		}
449#endif
450
451		/*
452		 * Okay, now it's the hard case.  Some other thread already
453		 * has a write lock or there are write waiters present, so
454		 * acquire the turnstile lock in order to begin the process
455		 * of blocking.
456		 */
457		ts = turnstile_trywait(&rw->lock_object);
458
459		/*
460		 * The lock might have been released while we spun, so
461		 * recheck its state and restart the loop if needed.
462		 */
463		v = rw->rw_lock;
464		if (RW_CAN_READ(v)) {
465			turnstile_cancel(ts);
466			continue;
467		}
468
469#ifdef ADAPTIVE_RWLOCKS
470		/*
471		 * The current lock owner might have started executing
472		 * on another CPU (or the lock could have changed
473		 * owners) while we were waiting on the turnstile
474		 * chain lock.  If so, drop the turnstile lock and try
475		 * again.
476		 */
477		if ((v & RW_LOCK_READ) == 0) {
478			owner = (struct thread *)RW_OWNER(v);
479			if (TD_IS_RUNNING(owner)) {
480				turnstile_cancel(ts);
481				continue;
482			}
483		}
484#endif
485
486		/*
487		 * The lock is held in write mode or it already has waiters.
488		 */
489		MPASS(!RW_CAN_READ(v));
490
491		/*
492		 * If the RW_LOCK_READ_WAITERS flag is already set, then
493		 * we can go ahead and block.  If it is not set then try
494		 * to set it.  If we fail to set it drop the turnstile
495		 * lock and restart the loop.
496		 */
497		if (!(v & RW_LOCK_READ_WAITERS)) {
498			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
499			    v | RW_LOCK_READ_WAITERS)) {
500				turnstile_cancel(ts);
501				continue;
502			}
503			if (LOCK_LOG_TEST(&rw->lock_object, 0))
504				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
505				    __func__, rw);
506		}
507
508		/*
509		 * We were unable to acquire the lock and the read waiters
510		 * flag is set, so we must block on the turnstile.
511		 */
512		if (LOCK_LOG_TEST(&rw->lock_object, 0))
513			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
514			    rw);
515#ifdef KDTRACE_HOOKS
516		sleep_time -= lockstat_nsecs();
517#endif
518		turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
519#ifdef KDTRACE_HOOKS
520		sleep_time += lockstat_nsecs();
521		sleep_cnt++;
522#endif
523		if (LOCK_LOG_TEST(&rw->lock_object, 0))
524			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
525			    __func__, rw);
526	}
527
528	/*
529	 * TODO: acquire "owner of record" here.  Here be turnstile dragons,
530	 * however: turnstiles currently don't like owners changing between
531	 * calls to turnstile_wait().
532	 */
533	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_RLOCK_ACQUIRE, rw, contested,
534	    waittime, file, line);
535	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
536	WITNESS_LOCK(&rw->lock_object, 0, file, line);
537	curthread->td_locks++;
538	curthread->td_rw_rlocks++;
539#ifdef KDTRACE_HOOKS
540	if (sleep_time)
541		LOCKSTAT_RECORD1(LS_RW_RLOCK_BLOCK, rw, sleep_time);
542
543	/*
544	 * Record only the loops spinning and not sleeping.
545	 */
546	if (spin_cnt > sleep_cnt)
547		LOCKSTAT_RECORD1(LS_RW_RLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
548#endif
549}
550
551int
552__rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
553{
554	struct rwlock *rw;
555	uintptr_t x;
556
557	if (SCHEDULER_STOPPED())
558		return (1);
559
560	rw = rwlock2rw(c);
561
562	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
563	    ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
564	    curthread, rw->lock_object.lo_name, file, line));
565
566	for (;;) {
567		x = rw->rw_lock;
568		KASSERT(rw->rw_lock != RW_DESTROYED,
569		    ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
570		if (!(x & RW_LOCK_READ))
571			break;
572		if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) {
573			LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
574			    line);
575			WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
576			curthread->td_locks++;
577			curthread->td_rw_rlocks++;
578			return (1);
579		}
580	}
581
582	LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
583	return (0);
584}
585
586void
587_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
588{
589	struct rwlock *rw;
590	struct turnstile *ts;
591	uintptr_t x, v, queue;
592
593	if (SCHEDULER_STOPPED())
594		return;
595
596	rw = rwlock2rw(c);
597
598	KASSERT(rw->rw_lock != RW_DESTROYED,
599	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
600	__rw_assert(c, RA_RLOCKED, file, line);
601	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
602	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
603
604	/* TODO: drop "owner of record" here. */
605
606	for (;;) {
607		/*
608		 * See if there is more than one read lock held.  If so,
609		 * just drop one and return.
610		 */
611		x = rw->rw_lock;
612		if (RW_READERS(x) > 1) {
613			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
614			    x - RW_ONE_READER)) {
615				if (LOCK_LOG_TEST(&rw->lock_object, 0))
616					CTR4(KTR_LOCK,
617					    "%s: %p succeeded %p -> %p",
618					    __func__, rw, (void *)x,
619					    (void *)(x - RW_ONE_READER));
620				break;
621			}
622			continue;
623		}
624		/*
625		 * If there aren't any waiters at all, then try to drop the
626		 * lock quickly.
627		 */
628		if (!(x & RW_LOCK_WAITERS)) {
629			MPASS((x & ~RW_LOCK_WRITE_SPINNER) ==
630			    RW_READERS_LOCK(1));
631			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
632			    RW_UNLOCKED)) {
633				if (LOCK_LOG_TEST(&rw->lock_object, 0))
634					CTR2(KTR_LOCK, "%s: %p last succeeded",
635					    __func__, rw);
636				break;
637			}
638			continue;
639		}
640		/*
641		 * Ok, we know we have waiters and we think we are the
642		 * last reader, so grab the turnstile lock.
643		 */
644		turnstile_chain_lock(&rw->lock_object);
645		v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
646		MPASS(v & RW_LOCK_WAITERS);
647
648		/*
649		 * Try to drop our lock, leaving the lock in an unlocked
650		 * state.
651		 *
652		 * If you wanted to do explicit lock handoff you'd have to
653		 * do it here.  You'd also want to use turnstile_signal()
654		 * and you'd have to handle the race where a higher
655		 * priority thread blocks on the write lock before the
656		 * thread you wake up actually runs and have the new thread
657		 * "steal" the lock.  For now it's a lot simpler to just
658		 * wake up all of the waiters.
659		 *
660		 * As above, if we fail, then another thread might have
661		 * acquired a read lock, so drop the turnstile lock and
662		 * restart.
663		 */
664		x = RW_UNLOCKED;
665		if (v & RW_LOCK_WRITE_WAITERS) {
666			queue = TS_EXCLUSIVE_QUEUE;
667			x |= (v & RW_LOCK_READ_WAITERS);
668		} else
669			queue = TS_SHARED_QUEUE;
670		if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
671		    x)) {
672			turnstile_chain_unlock(&rw->lock_object);
673			continue;
674		}
675		if (LOCK_LOG_TEST(&rw->lock_object, 0))
676			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
677			    __func__, rw);
678
679		/*
680		 * Ok.  The lock is released and all that's left is to
681		 * wake up the waiters.  Note that the lock might not be
682		 * free anymore, but in that case the writers will just
683		 * block again if they run before the new lock holder(s)
684		 * release the lock.
685		 */
686		ts = turnstile_lookup(&rw->lock_object);
687		MPASS(ts != NULL);
688		turnstile_broadcast(ts, queue);
689		turnstile_unpend(ts, TS_SHARED_LOCK);
690		turnstile_chain_unlock(&rw->lock_object);
691		break;
692	}
693	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_RUNLOCK_RELEASE, rw);
694	curthread->td_locks--;
695	curthread->td_rw_rlocks--;
696}
697
698/*
699 * This function is called when we are unable to obtain a write lock on the
700 * first try.  This means that at least one other thread holds either a
701 * read or write lock.
702 */
703void
704__rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
705    int line)
706{
707	struct rwlock *rw;
708	struct turnstile *ts;
709#ifdef ADAPTIVE_RWLOCKS
710	volatile struct thread *owner;
711	int spintries = 0;
712	int i;
713#endif
714	uintptr_t v, x;
715#ifdef LOCK_PROFILING
716	uint64_t waittime = 0;
717	int contested = 0;
718#endif
719#ifdef KDTRACE_HOOKS
720	uint64_t spin_cnt = 0;
721	uint64_t sleep_cnt = 0;
722	int64_t sleep_time = 0;
723#endif
724
725	if (SCHEDULER_STOPPED())
726		return;
727
728	rw = rwlock2rw(c);
729
730	if (rw_wlocked(rw)) {
731		KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
732		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
733		    __func__, rw->lock_object.lo_name, file, line));
734		rw->rw_recurse++;
735		if (LOCK_LOG_TEST(&rw->lock_object, 0))
736			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
737		return;
738	}
739
740	if (LOCK_LOG_TEST(&rw->lock_object, 0))
741		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
742		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
743
744	while (!_rw_write_lock(rw, tid)) {
745#ifdef KDTRACE_HOOKS
746		spin_cnt++;
747#endif
748#ifdef HWPMC_HOOKS
749		PMC_SOFT_CALL( , , lock, failed);
750#endif
751		lock_profile_obtain_lock_failed(&rw->lock_object,
752		    &contested, &waittime);
753#ifdef ADAPTIVE_RWLOCKS
754		/*
755		 * If the lock is write locked and the owner is
756		 * running on another CPU, spin until the owner stops
757		 * running or the state of the lock changes.
758		 */
759		v = rw->rw_lock;
760		owner = (struct thread *)RW_OWNER(v);
761		if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
762			if (LOCK_LOG_TEST(&rw->lock_object, 0))
763				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
764				    __func__, rw, owner);
765			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
766			    TD_IS_RUNNING(owner)) {
767				cpu_spinwait();
768#ifdef KDTRACE_HOOKS
769				spin_cnt++;
770#endif
771			}
772			continue;
773		}
774		if ((v & RW_LOCK_READ) && RW_READERS(v) &&
775		    spintries < rowner_retries) {
776			if (!(v & RW_LOCK_WRITE_SPINNER)) {
777				if (!atomic_cmpset_ptr(&rw->rw_lock, v,
778				    v | RW_LOCK_WRITE_SPINNER)) {
779					continue;
780				}
781			}
782			spintries++;
783			for (i = 0; i < rowner_loops; i++) {
784				if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0)
785					break;
786				cpu_spinwait();
787			}
788#ifdef KDTRACE_HOOKS
789			spin_cnt += rowner_loops - i;
790#endif
791			if (i != rowner_loops)
792				continue;
793		}
794#endif
795		ts = turnstile_trywait(&rw->lock_object);
796		v = rw->rw_lock;
797
798#ifdef ADAPTIVE_RWLOCKS
799		/*
800		 * The current lock owner might have started executing
801		 * on another CPU (or the lock could have changed
802		 * owners) while we were waiting on the turnstile
803		 * chain lock.  If so, drop the turnstile lock and try
804		 * again.
805		 */
806		if (!(v & RW_LOCK_READ)) {
807			owner = (struct thread *)RW_OWNER(v);
808			if (TD_IS_RUNNING(owner)) {
809				turnstile_cancel(ts);
810				continue;
811			}
812		}
813#endif
814		/*
815		 * Check the waiters flags of this rwlock.
816		 * If the lock was released without leaving any pending
817		 * waiters queue, simply try to acquire it.
818		 * If a pending waiters queue is present, claim the lock
819		 * ownership and preserve the pending queue.
820		 */
821		x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
822		if ((v & ~x) == RW_UNLOCKED) {
823			x &= ~RW_LOCK_WRITE_SPINNER;
824			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) {
825				if (x)
826					turnstile_claim(ts);
827				else
828					turnstile_cancel(ts);
829				break;
830			}
831			turnstile_cancel(ts);
832			continue;
833		}
834		/*
835		 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
836		 * set it.  If we fail to set it, then loop back and try
837		 * again.
838		 */
839		if (!(v & RW_LOCK_WRITE_WAITERS)) {
840			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
841			    v | RW_LOCK_WRITE_WAITERS)) {
842				turnstile_cancel(ts);
843				continue;
844			}
845			if (LOCK_LOG_TEST(&rw->lock_object, 0))
846				CTR2(KTR_LOCK, "%s: %p set write waiters flag",
847				    __func__, rw);
848		}
849		/*
850		 * We were unable to acquire the lock and the write waiters
851		 * flag is set, so we must block on the turnstile.
852		 */
853		if (LOCK_LOG_TEST(&rw->lock_object, 0))
854			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
855			    rw);
856#ifdef KDTRACE_HOOKS
857		sleep_time -= lockstat_nsecs();
858#endif
859		turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
860#ifdef KDTRACE_HOOKS
861		sleep_time += lockstat_nsecs();
862		sleep_cnt++;
863#endif
864		if (LOCK_LOG_TEST(&rw->lock_object, 0))
865			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
866			    __func__, rw);
867#ifdef ADAPTIVE_RWLOCKS
868		spintries = 0;
869#endif
870	}
871	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, rw, contested,
872	    waittime, file, line);
873#ifdef KDTRACE_HOOKS
874	if (sleep_time)
875		LOCKSTAT_RECORD1(LS_RW_WLOCK_BLOCK, rw, sleep_time);
876
877	/*
878	 * Record only the loops spinning and not sleeping.
879	 */
880	if (spin_cnt > sleep_cnt)
881		LOCKSTAT_RECORD1(LS_RW_WLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
882#endif
883}
884
885/*
886 * This function is called if the first try at releasing a write lock failed.
887 * This means that one of the two waiter bits must be set, indicating that at
888 * least one thread is waiting on this lock.
889 */
890void
891__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
892    int line)
893{
894	struct rwlock *rw;
895	struct turnstile *ts;
896	uintptr_t v;
897	int queue;
898
899	if (SCHEDULER_STOPPED())
900		return;
901
902	rw = rwlock2rw(c);
903
904	if (rw_wlocked(rw) && rw_recursed(rw)) {
905		rw->rw_recurse--;
906		if (LOCK_LOG_TEST(&rw->lock_object, 0))
907			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
908		return;
909	}
910
911	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
912	    ("%s: neither of the waiter flags are set", __func__));
913
914	if (LOCK_LOG_TEST(&rw->lock_object, 0))
915		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
916
917	turnstile_chain_lock(&rw->lock_object);
918	ts = turnstile_lookup(&rw->lock_object);
919	MPASS(ts != NULL);
920
921	/*
922	 * Use the same algorithm as sx locks for now.  Prefer waking up shared
923	 * waiters, if we have any, over writers.  This is probably not ideal.
924	 *
925	 * 'v' is the value we are going to write back to rw_lock.  If we
926	 * have waiters on both queues, we need to preserve the state of
927	 * the waiter flag for the queue we don't wake up.  For now this is
928	 * hardcoded for the algorithm mentioned above.
929	 *
930	 * In the case of both readers and writers waiting we wake up the
931	 * readers but leave the RW_LOCK_WRITE_WAITERS flag set.  If a
932	 * new writer comes in before a reader it will claim the lock up
933	 * above.  There is probably a potential priority inversion in
934	 * there that could be worked around either by waking both queues
935	 * of waiters or doing some complicated lock handoff gymnastics.
936	 */
937	v = RW_UNLOCKED;
938	if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) {
939		queue = TS_EXCLUSIVE_QUEUE;
940		v |= (rw->rw_lock & RW_LOCK_READ_WAITERS);
941	} else
942		queue = TS_SHARED_QUEUE;
943
944	/* Wake up all waiters for the specific queue. */
945	if (LOCK_LOG_TEST(&rw->lock_object, 0))
946		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
947		    queue == TS_SHARED_QUEUE ? "read" : "write");
948	turnstile_broadcast(ts, queue);
949	atomic_store_rel_ptr(&rw->rw_lock, v);
950	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
951	turnstile_chain_unlock(&rw->lock_object);
952}
953
954/*
955 * Attempt to do a non-blocking upgrade from a read lock to a write
956 * lock.  This will only succeed if this thread holds a single read
957 * lock.  Returns true if the upgrade succeeded and false otherwise.
958 */
959int
960__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
961{
962	struct rwlock *rw;
963	uintptr_t v, x, tid;
964	struct turnstile *ts;
965	int success;
966
967	if (SCHEDULER_STOPPED())
968		return (1);
969
970	rw = rwlock2rw(c);
971
972	KASSERT(rw->rw_lock != RW_DESTROYED,
973	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
974	__rw_assert(c, RA_RLOCKED, file, line);
975
976	/*
977	 * Attempt to switch from one reader to a writer.  If there
978	 * are any write waiters, then we will have to lock the
979	 * turnstile first to prevent races with another writer
980	 * calling turnstile_wait() before we have claimed this
981	 * turnstile.  So, do the simple case of no waiters first.
982	 */
983	tid = (uintptr_t)curthread;
984	success = 0;
985	for (;;) {
986		v = rw->rw_lock;
987		if (RW_READERS(v) > 1)
988			break;
989		if (!(v & RW_LOCK_WAITERS)) {
990			success = atomic_cmpset_ptr(&rw->rw_lock, v, tid);
991			if (!success)
992				continue;
993			break;
994		}
995
996		/*
997		 * Ok, we think we have waiters, so lock the turnstile.
998		 */
999		ts = turnstile_trywait(&rw->lock_object);
1000		v = rw->rw_lock;
1001		if (RW_READERS(v) > 1) {
1002			turnstile_cancel(ts);
1003			break;
1004		}
1005		/*
1006		 * Try to switch from one reader to a writer again.  This time
1007		 * we honor the current state of the waiters flags.
1008		 * If we obtain the lock with the flags set, then claim
1009		 * ownership of the turnstile.
1010		 */
1011		x = rw->rw_lock & RW_LOCK_WAITERS;
1012		success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
1013		if (success) {
1014			if (x)
1015				turnstile_claim(ts);
1016			else
1017				turnstile_cancel(ts);
1018			break;
1019		}
1020		turnstile_cancel(ts);
1021	}
1022	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
1023	if (success) {
1024		curthread->td_rw_rlocks--;
1025		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
1026		    file, line);
1027		LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, rw);
1028	}
1029	return (success);
1030}
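
/*
 * Illustrative sketch (comment only, not compiled): a common pattern is
 * to search under the read lock and call rw_try_upgrade() only once a
 * modification turns out to be necessary, falling back to dropping the
 * read lock and re-acquiring as a writer when the upgrade fails.  The
 * "foo" names are hypothetical.
 *
 *	rw_rlock(&foo_lock);
 *	if (foo_needs_update()) {
 *		if (!rw_try_upgrade(&foo_lock)) {
 *			rw_runlock(&foo_lock);
 *			rw_wlock(&foo_lock);
 *			(re-check here: the state may have changed while
 *			the lock was dropped)
 *		}
 *		foo_update();
 *		rw_wunlock(&foo_lock);
 *	} else
 *		rw_runlock(&foo_lock);
 */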
1031
1032/*
1033 * Downgrade a write lock into a single read lock.
1034 */
1035void
1036__rw_downgrade(volatile uintptr_t *c, const char *file, int line)
1037{
1038	struct rwlock *rw;
1039	struct turnstile *ts;
1040	uintptr_t tid, v;
1041	int rwait, wwait;
1042
1043	if (SCHEDULER_STOPPED())
1044		return;
1045
1046	rw = rwlock2rw(c);
1047
1048	KASSERT(rw->rw_lock != RW_DESTROYED,
1049	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
1050	__rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line);
1051#ifndef INVARIANTS
1052	if (rw_recursed(rw))
1053		panic("downgrade of a recursed lock");
1054#endif
1055
1056	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
1057
1058	/*
1059	 * Convert from a writer to a single reader.  First we handle
1060	 * the easy case with no waiters.  If there are any waiters, we
1061	 * lock the turnstile and "disown" the lock.
1062	 */
1063	tid = (uintptr_t)curthread;
1064	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
1065		goto out;
1066
1067	/*
1068	 * Ok, we think we have waiters, so lock the turnstile so we can
1069	 * read the waiter flags without any races.
1070	 */
1071	turnstile_chain_lock(&rw->lock_object);
1072	v = rw->rw_lock & RW_LOCK_WAITERS;
1073	rwait = v & RW_LOCK_READ_WAITERS;
1074	wwait = v & RW_LOCK_WRITE_WAITERS;
1075	MPASS(rwait | wwait);
1076
1077	/*
1078	 * Downgrade from a write lock while preserving waiters flag
1079	 * and give up ownership of the turnstile.
1080	 */
1081	ts = turnstile_lookup(&rw->lock_object);
1082	MPASS(ts != NULL);
1083	if (!wwait)
1084		v &= ~RW_LOCK_READ_WAITERS;
1085	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
1086	/*
1087	 * Wake other readers if there are no writers pending.  Otherwise they
1088	 * won't be able to acquire the lock anyway.
1089	 */
1090	if (rwait && !wwait) {
1091		turnstile_broadcast(ts, TS_SHARED_QUEUE);
1092		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
1093	} else
1094		turnstile_disown(ts);
1095	turnstile_chain_unlock(&rw->lock_object);
1096out:
1097	curthread->td_rw_rlocks++;
1098	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
1099	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, rw);
1100}
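
/*
 * Illustrative sketch (comment only, not compiled): rw_downgrade() lets
 * a writer finish its exclusive update and then keep a read lock while
 * other readers are admitted again.  The "foo" names are hypothetical.
 *
 *	rw_wlock(&foo_lock);
 *	foo_rebuild();
 *	rw_downgrade(&foo_lock);
 *	foo_scan();
 *	rw_runlock(&foo_lock);
 */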
1101
1102#ifdef INVARIANT_SUPPORT
1103#ifndef INVARIANTS
1104#undef __rw_assert
1105#endif
1106
1107/*
1108 * In the non-WITNESS case, rw_assert() can only detect that at least
1109 * *some* thread owns an rlock, but it cannot guarantee that *this*
1110 * thread owns an rlock.
1111 */
1112void
1113__rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
1114{
1115	const struct rwlock *rw;
1116
1117	if (panicstr != NULL)
1118		return;
1119
1120	rw = rwlock2rw(c);
1121
1122	switch (what) {
1123	case RA_LOCKED:
1124	case RA_LOCKED | RA_RECURSED:
1125	case RA_LOCKED | RA_NOTRECURSED:
1126	case RA_RLOCKED:
1127	case RA_RLOCKED | RA_RECURSED:
1128	case RA_RLOCKED | RA_NOTRECURSED:
1129#ifdef WITNESS
1130		witness_assert(&rw->lock_object, what, file, line);
1131#else
1132		/*
1133		 * If some other thread has a write lock or we have one
1134		 * and are asserting a read lock, fail.  Also, if no one
1135		 * has a lock at all, fail.
1136		 */
1137		if (rw->rw_lock == RW_UNLOCKED ||
1138		    (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
1139		    rw_wowner(rw) != curthread)))
1140			panic("Lock %s not %slocked @ %s:%d\n",
1141			    rw->lock_object.lo_name, (what & RA_RLOCKED) ?
1142			    "read " : "", file, line);
1143
1144		if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
1145			if (rw_recursed(rw)) {
1146				if (what & RA_NOTRECURSED)
1147					panic("Lock %s recursed @ %s:%d\n",
1148					    rw->lock_object.lo_name, file,
1149					    line);
1150			} else if (what & RA_RECURSED)
1151				panic("Lock %s not recursed @ %s:%d\n",
1152				    rw->lock_object.lo_name, file, line);
1153		}
1154#endif
1155		break;
1156	case RA_WLOCKED:
1157	case RA_WLOCKED | RA_RECURSED:
1158	case RA_WLOCKED | RA_NOTRECURSED:
1159		if (rw_wowner(rw) != curthread)
1160			panic("Lock %s not exclusively locked @ %s:%d\n",
1161			    rw->lock_object.lo_name, file, line);
1162		if (rw_recursed(rw)) {
1163			if (what & RA_NOTRECURSED)
1164				panic("Lock %s recursed @ %s:%d\n",
1165				    rw->lock_object.lo_name, file, line);
1166		} else if (what & RA_RECURSED)
1167			panic("Lock %s not recursed @ %s:%d\n",
1168			    rw->lock_object.lo_name, file, line);
1169		break;
1170	case RA_UNLOCKED:
1171#ifdef WITNESS
1172		witness_assert(&rw->lock_object, what, file, line);
1173#else
1174		/*
1175		 * If we hold a write lock, fail.  We can't reliably check
1176		 * to see if we hold a read lock or not.
1177		 */
1178		if (rw_wowner(rw) == curthread)
1179			panic("Lock %s exclusively locked @ %s:%d\n",
1180			    rw->lock_object.lo_name, file, line);
1181#endif
1182		break;
1183	default:
1184		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
1185		    line);
1186	}
1187}
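
/*
 * Illustrative sketch (comment only, not compiled): rw_assert() is
 * typically used to document and enforce a locking contract, e.g. in a
 * helper that requires its caller to hold the write lock.  The "foo"
 * names are hypothetical.
 *
 *	static void
 *	foo_update_locked(void)
 *	{
 *
 *		rw_assert(&foo_lock, RA_WLOCKED);
 *		foo_count++;
 *	}
 */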
1188#endif /* INVARIANT_SUPPORT */
1189
1190#ifdef DDB
1191void
1192db_show_rwlock(const struct lock_object *lock)
1193{
1194	const struct rwlock *rw;
1195	struct thread *td;
1196
1197	rw = (const struct rwlock *)lock;
1198
1199	db_printf(" state: ");
1200	if (rw->rw_lock == RW_UNLOCKED)
1201		db_printf("UNLOCKED\n");
1202	else if (rw->rw_lock == RW_DESTROYED) {
1203		db_printf("DESTROYED\n");
1204		return;
1205	} else if (rw->rw_lock & RW_LOCK_READ)
1206		db_printf("RLOCK: %ju locks\n",
1207		    (uintmax_t)(RW_READERS(rw->rw_lock)));
1208	else {
1209		td = rw_wowner(rw);
1210		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
1211		    td->td_tid, td->td_proc->p_pid, td->td_name);
1212		if (rw_recursed(rw))
1213			db_printf(" recursed: %u\n", rw->rw_recurse);
1214	}
1215	db_printf(" waiters: ");
1216	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
1217	case RW_LOCK_READ_WAITERS:
1218		db_printf("readers\n");
1219		break;
1220	case RW_LOCK_WRITE_WAITERS:
1221		db_printf("writers\n");
1222		break;
1223	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
1224		db_printf("readers and writers\n");
1225		break;
1226	default:
1227		db_printf("none\n");
1228		break;
1229	}
1230}
1231
1232#endif
1233