/* rrwlock.c — revision 211932 */
1185029Spjd/* 2185029Spjd * CDDL HEADER START 3185029Spjd * 4185029Spjd * The contents of this file are subject to the terms of the 5185029Spjd * Common Development and Distribution License (the "License"). 6185029Spjd * You may not use this file except in compliance with the License. 7185029Spjd * 8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9185029Spjd * or http://www.opensolaris.org/os/licensing. 10185029Spjd * See the License for the specific language governing permissions 11185029Spjd * and limitations under the License. 12185029Spjd * 13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each 14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15185029Spjd * If applicable, add the following below this CDDL HEADER, with the 16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18185029Spjd * 19185029Spjd * CDDL HEADER END 20185029Spjd */ 21185029Spjd/* 22211932Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23185029Spjd * Use is subject to license terms. 24185029Spjd */ 25185029Spjd 26185029Spjd#include <sys/refcount.h> 27185029Spjd#include <sys/rrwlock.h> 28185029Spjd 29185029Spjd/* 30185029Spjd * This file contains the implementation of a re-entrant read 31185029Spjd * reader/writer lock (aka "rrwlock"). 32185029Spjd * 33185029Spjd * This is a normal reader/writer lock with the additional feature 34185029Spjd * of allowing threads who have already obtained a read lock to 35185029Spjd * re-enter another read lock (re-entrant read) - even if there are 36185029Spjd * waiting writers. 37185029Spjd * 38185029Spjd * Callers who have not obtained a read lock give waiting writers priority. 
39185029Spjd * 40185029Spjd * The rrwlock_t lock does not allow re-entrant writers, nor does it 41185029Spjd * allow a re-entrant mix of reads and writes (that is, it does not 42185029Spjd * allow a caller who has already obtained a read lock to be able to 43185029Spjd * then grab a write lock without first dropping all read locks, and 44185029Spjd * vice versa). 45185029Spjd * 46185029Spjd * The rrwlock_t uses tsd (thread specific data) to keep a list of 47185029Spjd * nodes (rrw_node_t), where each node keeps track of which specific 48185029Spjd * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering 49185029Spjd * should be rare, a thread that grabs multiple reads on the same rrwlock_t 50185029Spjd * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the 51185029Spjd * tsd list can represent a different rrwlock_t. This allows a thread 52185029Spjd * to enter multiple and unique rrwlock_ts for read locks at the same time. 53185029Spjd * 54185029Spjd * Since using tsd exposes some overhead, the rrwlock_t only needs to 55185029Spjd * keep tsd data when writers are waiting. If no writers are waiting, then 56185029Spjd * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd 57185029Spjd * is needed. Once a writer attempts to grab the lock, readers then 58185029Spjd * keep tsd data and bump the linked readers count (rr_linked_rcount). 59185029Spjd * 60185029Spjd * If there are waiting writers and there are anonymous readers, then a 61185029Spjd * reader doesn't know if it is a re-entrant lock. But since it may be one, 62185029Spjd * we allow the read to proceed (otherwise it could deadlock). Since once 63185029Spjd * waiting writers are active, readers no longer bump the anonymous count, 64185029Spjd * the anonymous readers will eventually flush themselves out. At this point, 65185029Spjd * readers will be able to tell if they are a re-entrant lock (have a 66185029Spjd * rrw_node_t entry for the lock) or not. 
If they are a re-entrant lock, then 67185029Spjd * we must let the proceed. If they are not, then the reader blocks for the 68185029Spjd * waiting writers. Hence, we do not starve writers. 69185029Spjd */ 70185029Spjd 71185029Spjd/* global key for TSD */ 72185029Spjduint_t rrw_tsd_key; 73185029Spjd 74185029Spjdtypedef struct rrw_node { 75185029Spjd struct rrw_node *rn_next; 76185029Spjd rrwlock_t *rn_rrl; 77185029Spjd} rrw_node_t; 78185029Spjd 79185029Spjdstatic rrw_node_t * 80185029Spjdrrn_find(rrwlock_t *rrl) 81185029Spjd{ 82185029Spjd rrw_node_t *rn; 83185029Spjd 84185029Spjd if (refcount_count(&rrl->rr_linked_rcount) == 0) 85211932Smm return (B_FALSE); 86185029Spjd 87185029Spjd for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { 88185029Spjd if (rn->rn_rrl == rrl) 89185029Spjd return (rn); 90185029Spjd } 91185029Spjd return (NULL); 92185029Spjd} 93185029Spjd 94185029Spjd/* 95185029Spjd * Add a node to the head of the singly linked list. 96185029Spjd */ 97185029Spjdstatic void 98185029Spjdrrn_add(rrwlock_t *rrl) 99185029Spjd{ 100185029Spjd rrw_node_t *rn; 101185029Spjd 102185029Spjd rn = kmem_alloc(sizeof (*rn), KM_SLEEP); 103185029Spjd rn->rn_rrl = rrl; 104185029Spjd rn->rn_next = tsd_get(rrw_tsd_key); 105185029Spjd VERIFY(tsd_set(rrw_tsd_key, rn) == 0); 106185029Spjd} 107185029Spjd 108185029Spjd/* 109185029Spjd * If a node is found for 'rrl', then remove the node from this 110185029Spjd * thread's list and return TRUE; otherwise return FALSE. 
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	/*
	 * If there are no linked readers at all, this thread cannot
	 * have a node for 'rrl'; skip the list walk.
	 */
	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			/* Unlink 'rn' from this thread's tsd list and free it. */
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

/*
 * Initialize 'rrl': no writer, no waiting writer, zero readers of
 * either kind (anonymous and linked).
 */
void
rrw_init(rrwlock_t *rrl)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
}

/*
 * Tear down 'rrl'. Must not be write-held (asserted); the reader
 * refcounts are destroyed as well.
 */
void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

/*
 * Acquire 'rrl' for reading. See the file comment for the anonymous
 * vs. linked (tsd-tracked) reader distinction.
 */
static void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Non-DEBUG kernel fast path: when no writer holds or wants the
	 * lock, bump the anonymous count field directly and skip the
	 * refcount/tsd bookkeeping in the slow path below.
	 */
	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif

	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	/*
	 * Block while a writer holds the lock, or while a writer is
	 * waiting and we cannot show we are already a reader (no
	 * anonymous readers remain and we have no tsd node for 'rrl').
	 */
	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

/*
 * Acquire 'rrl' exclusively. Sets rr_writer_wanted while waiting so
 * new non-re-entrant readers block and the writer is not starved;
 * proceeds once both reader counts are zero and no writer is active.
 */
static void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	/* Re-entrant write locking is not supported. */
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

/*
 * Public entry point: dispatch to the reader or writer path.
 * 'tag' is used only for reader refcount accounting.
 */
void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

/*
 * Release one read or write hold on 'rrl'.
 */
void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	/*
	 * Non-DEBUG kernel fast path: no writer and no linked readers
	 * means this hold must be an anonymous read; drop the count
	 * field directly, waking waiters when the last reader leaves.
	 */
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		/*
		 * A tsd node for 'rrl' means this hold was a linked
		 * read; otherwise it was anonymous.
		 */
		if (rrn_find_and_remove(rrl))
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		else
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		/* Last reader of this kind out wakes any waiters. */
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		/* Drop the write lock and wake all waiters. */
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

/*
 * Report whether 'rrl' is held in the given mode. For RW_WRITER this
 * checks that the caller is the current writer; for readers it only
 * reports that some thread holds a read lock (anonymous or linked),
 * not necessarily the caller.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    !refcount_is_zero(&rrl->rr_linked_rcount));
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}