/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/refcount.h>
#include <sys/rrwlock.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads that have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to then grab a
 * write lock without first dropping all read locks, and vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts with the same 'rn_rrl'.  Nodes on the
 * tsd list can represent different rrwlock_ts.  This allows a thread
 * to enter multiple and unique rrwlock_ts for read locks at the same time.
 *
 * Since using tsd exposes some overhead, the rrwlock_t only needs to
 * keep tsd data when writers are waiting.  If no writers are waiting, then
 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
 * data is needed.  Once a writer attempts to grab the lock, readers then
 * keep tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, then a
 * reader doesn't know if it holds a re-entrant lock.  But since it may
 * hold one, we allow the read to proceed (otherwise it could deadlock).
 * Since once waiting writers are active, readers no longer bump the
 * anonymous count, the anonymous readers will eventually flush themselves
 * out.  At this point, readers will be able to tell if they hold a
 * re-entrant lock (have a rrw_node_t entry for the lock) or not.  If they
 * do, we must let them proceed.  If they do not, then the reader blocks
 * for the waiting writers.  Hence, we do not starve writers.
 */
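/*
 * Illustrative usage - a minimal sketch, not taken from an in-tree
 * caller (the 'lock' variable is hypothetical; FTAG is the usual
 * "current function" tag passed to the refcount routines):
 *
 *	rrwlock_t lock;
 *
 *	rrw_init(&lock, B_FALSE);
 *	rrw_enter_read(&lock, FTAG);	-- first read hold
 *	rrw_enter_read(&lock, FTAG);	-- re-entrant read, never blocks
 *	rrw_exit(&lock, FTAG);
 *	rrw_exit(&lock, FTAG);
 *	rrw_destroy(&lock);
 */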
/* global key for TSD */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node *rn_next;
	rrwlock_t *rn_rrl;
	void *rn_tag;
} rrw_node_t;

static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl, void *tag)
{
	rrw_node_t *rn;

	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	rn->rn_tag = tag;
	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
}
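/*
 * A sketch of the resulting per-thread list (locks A and B are
 * hypothetical): after a thread takes linked reads on A, A again
 * (re-entrant), and then B, head insertion in rrn_add() has built:
 *
 *	tsd_get(rrw_tsd_key) -> [B] -> [A] -> [A] -> NULL
 *
 * rrn_find() returns the first node whose rn_rrl matches, while
 * rrn_find_and_remove() below also requires a matching rn_tag.
 */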
/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl, void *tag)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl, boolean_t track_all)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_track_all = track_all;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

static void
rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
	    !rrl->rr_track_all) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted || rrl->rr_track_all) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl, tag);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	rrw_enter_read_impl(rrl, B_FALSE, tag);
}

/*
 * Take a read lock even if there are pending write lock requests.  If we
 * want to take a lock reentrantly, but from different threads (that have
 * a relationship to each other), the normal detection mechanism to
 * overrule the pending writer does not work, so we have to give an
 * explicit hint here.
 */
void
rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
{
	rrw_enter_read_impl(rrl, B_TRUE, tag);
}
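/*
 * Illustrative use of the priority variant - a sketch under the assumption
 * of a cooperating helper thread (the thread split is hypothetical).  A
 * thread holding the lock for read hands work to a helper; the helper has
 * no rrw_node_t of its own, so with a writer queued rrw_enter_read() would
 * block it, while rrw_enter_read_prio() lets it through:
 *
 *	parent:	rrw_enter_read(&lock, FTAG);
 *		... dispatch task to helper, wait for completion ...
 *	helper:	rrw_enter_read_prio(&lock, FTAG);
 *		... do work on behalf of parent ...
 *		rrw_exit(&lock, FTAG);
 *	parent:	rrw_exit(&lock, FTAG);
 */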
void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		if (rrn_find_and_remove(rrl, tag)) {
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		} else {
			ASSERT(!rrl->rr_track_all);
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		}
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}
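/*
 * A note on tags, with a minimal sketch: a linked read is found again at
 * exit time by rrn_find_and_remove(), which matches on both the lock and
 * the tag, so each rrw_exit() must pass the same tag (and run in the same
 * thread) as the rrw_enter() it balances:
 *
 *	rrw_enter(&lock, RW_READER, FTAG);
 *	...
 *	rrw_exit(&lock, FTAG);
 */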
/*
 * If the lock was created with track_all, rrw_held(RW_READER) will return
 * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
 * return B_TRUE if any thread has the lock for reader.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    rrn_find(rrl) != NULL);
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}

void
rrw_tsd_destroy(void *arg)
{
	rrw_node_t *rn = arg;
	if (rn != NULL) {
		panic("thread %p terminating with rrw lock %p held",
		    (void *)curthread, (void *)rn->rn_rrl);
	}
}

/*
 * A reader-mostly lock implementation, layered on top of the rrwlock_t
 * above and tuned for highly parallel read acquisitions at the cost of
 * slower writes.
 *
 * The idea is to split a single busy lock into an array of locks, so that
 * each reader locks only one of them for read, chosen by a simple hash
 * function.  That proportionally reduces lock contention.  A writer, in
 * turn, has to sequentially acquire the write lock on all of the locks.
 * That makes write acquisition proportionally slower, but in the places
 * where it is used (filesystem unmount) performance is not critical.
 *
 * All the functions below are direct wrappers around the functions above.
 */
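/*
 * Illustrative usage - a sketch mirroring the rrwlock_t example above
 * (the 'mlock' variable is hypothetical):
 *
 *	rrmlock_t mlock;
 *
 *	rrm_init(&mlock, B_FALSE);
 *	rrm_enter(&mlock, RW_READER, FTAG);	-- takes 1 of RRM_NUM_LOCKS
 *	rrm_exit(&mlock, FTAG);			-- same thread, same stripe
 *	rrm_enter(&mlock, RW_WRITER, FTAG);	-- takes all RRM_NUM_LOCKS
 *	rrm_exit(&mlock, FTAG);
 *	rrm_destroy(&mlock);
 */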
void
rrm_init(rrmlock_t *rrl, boolean_t track_all)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_init(&rrl->locks[i], track_all);
}

void
rrm_destroy(rrmlock_t *rrl)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_destroy(&rrl->locks[i]);
}

void
rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrm_enter_read(rrl, tag);
	else
		rrm_enter_write(rrl);
}

/*
 * This maps the current thread to a specific lock.  Note that the lock
 * must be released by the same thread that acquired it.  We do this
 * mapping by taking the thread pointer mod a prime number.  We examine
 * only the low 32 bits of the thread pointer, because 32-bit division
 * is faster than 64-bit division, and the high 32 bits have little
 * entropy anyway.
 */
#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)

void
rrm_enter_read(rrmlock_t *rrl, void *tag)
{
	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
}

void
rrm_enter_write(rrmlock_t *rrl)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_enter_write(&rrl->locks[i]);
}

void
rrm_exit(rrmlock_t *rrl, void *tag)
{
	int i;

	if (rrl->locks[0].rr_writer == curthread) {
		for (i = 0; i < RRM_NUM_LOCKS; i++)
			rrw_exit(&rrl->locks[i], tag);
	} else {
		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
	}
}

boolean_t
rrm_held(rrmlock_t *rrl, krw_t rw)
{
	if (rw == RW_WRITER) {
		return (rrw_held(&rrl->locks[0], rw));
	} else {
		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
	}
}