nfs4_db.c revision 9885:a3d5e9d9e779
1333347Speter/*
2333347Speter * CDDL HEADER START
3333347Speter *
4333347Speter * The contents of this file are subject to the terms of the
5333347Speter * Common Development and Distribution License (the "License").
6333347Speter * You may not use this file except in compliance with the License.
7333347Speter *
8333347Speter * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9333347Speter * or http://www.opensolaris.org/os/licensing.
10333347Speter * See the License for the specific language governing permissions
11333347Speter * and limitations under the License.
12333347Speter *
13333347Speter * When distributing Covered Code, include this CDDL HEADER in each
14333347Speter * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15333347Speter * If applicable, add the following below this CDDL HEADER, with the
16333347Speter * fields enclosed by brackets "[]" replaced with your own identifying
17333347Speter * information: Portions Copyright [yyyy] [name of copyright owner]
18333347Speter *
19333347Speter * CDDL HEADER END
20333347Speter */
21333347Speter/*
22333347Speter * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23333347Speter * Use is subject to license terms.
24333347Speter */
25333347Speter
26333347Speter#include <sys/systm.h>
27333347Speter#include <sys/cmn_err.h>
28333347Speter#include <sys/kmem.h>
29333347Speter#include <sys/disp.h>
30333347Speter#include <sys/id_space.h>
31333347Speter#include <sys/atomic.h>
32333347Speter#include <rpc/rpc.h>
33333347Speter#include <nfs/nfs4.h>
34333347Speter#include <nfs/nfs4_db_impl.h>
35333347Speter
/*
 * Upper bound, in seconds, on how long a reaper thread sleeps between
 * passes.  reaper_thread() uses the smaller of this value and the
 * table's dbt_max_cache_time.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations of the file-local helpers defined further down. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);
42333347Speter
43333347Speterid_t
44333347Speterrfs4_dbe_getid(rfs4_dbe_t *entry)
45333347Speter{
46333347Speter	return (entry->dbe_id);
47333347Speter}
48333347Speter
/*
 * Take one additional reference on the entry by atomically
 * incrementing dbe_refcnt.  The entry lock is not taken here.
 */
void
rfs4_dbe_hold(rfs4_dbe_t *entry)
{
	atomic_add_32(&entry->dbe_refcnt, 1);
}
54333347Speter
/*
 * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
 * Unlike rfs4_dbe_rele() it neither takes the entry lock nor updates
 * dbe_time_rele.
 */
void
rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
{
	atomic_add_32(&entry->dbe_refcnt, -1);
}
63333347Speter
64333347Speter
65333347Speteruint32_t
66333347Speterrfs4_dbe_refcnt(rfs4_dbe_t *entry)
67333347Speter{
68333347Speter	return (entry->dbe_refcnt);
69333347Speter}
70
/*
 * Mark an entry such that the dbsearch will skip it.
 * Caller does not want this entry to be found any longer.
 *
 * Both dbe_invalid and dbe_skipsearch are set; this function does not
 * take the entry lock itself.
 */
void
rfs4_dbe_invalidate(rfs4_dbe_t *entry)
{
	entry->dbe_invalid = TRUE;
	entry->dbe_skipsearch = TRUE;
}
81
82/*
83 * Is this entry invalid?
84 */
85bool_t
86rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
87{
88	return (entry->dbe_invalid);
89}
90
91time_t
92rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
93{
94	return (entry->dbe_time_rele);
95}
96
97/*
98 * Use these to temporarily hide/unhide a db entry.
99 */
100void
101rfs4_dbe_hide(rfs4_dbe_t *entry)
102{
103	rfs4_dbe_lock(entry);
104	entry->dbe_skipsearch = TRUE;
105	rfs4_dbe_unlock(entry);
106}
107
108void
109rfs4_dbe_unhide(rfs4_dbe_t *entry)
110{
111	rfs4_dbe_lock(entry);
112	entry->dbe_skipsearch = FALSE;
113	rfs4_dbe_unlock(entry);
114}
115
116void
117rfs4_dbe_rele(rfs4_dbe_t *entry)
118{
119	mutex_enter(entry->dbe_lock);
120	ASSERT(entry->dbe_refcnt > 1);
121	atomic_add_32(&entry->dbe_refcnt, -1);
122	entry->dbe_time_rele = gethrestime_sec();
123	mutex_exit(entry->dbe_lock);
124}
125
/*
 * Acquire the entry's mutex (dbe_lock).
 */
void
rfs4_dbe_lock(rfs4_dbe_t *entry)
{
	mutex_enter(entry->dbe_lock);
}
131
/*
 * Release the entry's mutex (dbe_lock).
 */
void
rfs4_dbe_unlock(rfs4_dbe_t *entry)
{
	mutex_exit(entry->dbe_lock);
}
137
138bool_t
139rfs4_dbe_islocked(rfs4_dbe_t *entry)
140{
141	return (mutex_owned(entry->dbe_lock));
142}
143
144clock_t
145rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
146{
147	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
148}
149
/*
 * Wake every thread waiting on the entry's condition variable
 * (for example a thread blocked in rfs4_dbe_twait()).
 */
void
rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
{
	cv_broadcast(entry->dbe_cv);
}
155
156/* ARGSUSED */
157static int
158rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
159{
160	rfs4_dbe_t *entry = obj;
161
162	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
163	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
164
165	return (0);
166}
167
168static void
169rfs4_dbe_kmem_destructor(void *obj, void *private)
170{
171	rfs4_dbe_t *entry = obj;
172	/*LINTED*/
173	rfs4_table_t *table = private;
174
175	mutex_destroy(entry->dbe_lock);
176	cv_destroy(entry->dbe_cv);
177}
178
179rfs4_database_t *
180rfs4_database_create(uint32_t flags)
181{
182	rfs4_database_t *db;
183
184	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
185	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
186	db->db_tables = NULL;
187	db->db_debug_flags = flags;
188	db->db_shutdown_count = 0;
189	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
190	return (db);
191}
192
193
194/*
195 * The reaper threads that have been created for the tables in this
196 * database must be stopped and the entries in the tables released.
197 * Each table will be marked as "shutdown" and the reaper threads
198 * poked and they will see that a shutdown is in progress and cleanup
199 * and exit.  This function waits for all reaper threads to stop
200 * before returning to the caller.
201 */
202void
203rfs4_database_shutdown(rfs4_database_t *db)
204{
205	rfs4_table_t *table;
206
207	mutex_enter(db->db_lock);
208	for (table = db->db_tables; table; table = table->dbt_tnext) {
209		table->dbt_reaper_shutdown = TRUE;
210		mutex_enter(&table->dbt_reaper_cv_lock);
211		cv_broadcast(&table->dbt_reaper_wait);
212		db->db_shutdown_count++;
213		mutex_exit(&table->dbt_reaper_cv_lock);
214	}
215	while (db->db_shutdown_count > 0) {
216		cv_wait(&db->db_shutdown_wait, db->db_lock);
217	}
218	mutex_exit(db->db_lock);
219}
220
221/*
222 * Given a database that has been "shutdown" by the function above all
223 * of the table tables are destroyed and then the database itself
224 * freed.
225 */
226void
227rfs4_database_destroy(rfs4_database_t *db)
228{
229	rfs4_table_t *next, *tmp;
230
231	for (next = db->db_tables; next; ) {
232		tmp = next;
233		next = tmp->dbt_tnext;
234		rfs4_table_destroy(db, tmp);
235	}
236
237	mutex_destroy(db->db_lock);
238	kmem_free(db, sizeof (rfs4_database_t));
239}
240
241rfs4_table_t *
242rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
243    uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
244    void (*destroy)(rfs4_entry_t),
245    bool_t (*expiry)(rfs4_entry_t),
246    uint32_t size, uint32_t hashsize,
247    uint32_t maxentries, id_t start)
248{
249	rfs4_table_t *table;
250	int len;
251	char *cache_name;
252	char *id_name;
253
254	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
255	table->dbt_db = db;
256	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
257	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
258	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
259	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
260
261	len = strlen(tabname);
262	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
263	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
264	(void) strcpy(table->dbt_name, tabname);
265	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
266	table->dbt_max_cache_time = max_cache_time;
267	table->dbt_usize = size;
268	table->dbt_len = hashsize;
269	table->dbt_count = 0;
270	table->dbt_idxcnt = 0;
271	table->dbt_ccnt = 0;
272	table->dbt_maxcnt = idxcnt;
273	table->dbt_indices = NULL;
274	table->dbt_id_space = NULL;
275	table->dbt_reaper_shutdown = FALSE;
276
277	if (start >= 0) {
278		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
279			maxentries = INT32_MAX - start;
280		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
281		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
282		table->dbt_id_space = id_space_create(id_name, start,
283		    maxentries + start);
284		kmem_free(id_name, len + 10);
285	}
286	table->dbt_maxentries = maxentries;
287	table->dbt_create = create;
288	table->dbt_destroy = destroy;
289	table->dbt_expiry = expiry;
290
291	table->dbt_mem_cache = kmem_cache_create(cache_name,
292	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
293	    0,
294	    rfs4_dbe_kmem_constructor,
295	    rfs4_dbe_kmem_destructor,
296	    NULL,
297	    table,
298	    NULL,
299	    0);
300	kmem_free(cache_name, len+13);
301
302	table->dbt_debug = db->db_debug_flags;
303
304	mutex_enter(db->db_lock);
305	table->dbt_tnext = db->db_tables;
306	db->db_tables = table;
307	mutex_exit(db->db_lock);
308
309	rfs4_start_reaper(table);
310
311	return (table);
312}
313
314void
315rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
316{
317	rfs4_table_t *p;
318	rfs4_index_t *idx;
319
320	ASSERT(table->dbt_count == 0);
321
322	mutex_enter(db->db_lock);
323	if (table == db->db_tables)
324		db->db_tables = table->dbt_tnext;
325	else {
326		for (p = db->db_tables; p; p = p->dbt_tnext)
327			if (p->dbt_tnext == table) {
328				p->dbt_tnext = table->dbt_tnext;
329				table->dbt_tnext = NULL;
330				break;
331			}
332		ASSERT(p != NULL);
333	}
334	mutex_exit(db->db_lock);
335
336	/* Destroy indices */
337	while (table->dbt_indices) {
338		idx = table->dbt_indices;
339		table->dbt_indices = idx->dbi_inext;
340		rfs4_index_destroy(idx);
341	}
342
343	rw_destroy(table->dbt_t_lock);
344	mutex_destroy(table->dbt_lock);
345	mutex_destroy(&table->dbt_reaper_cv_lock);
346	cv_destroy(&table->dbt_reaper_wait);
347
348	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
349	if (table->dbt_id_space)
350		id_space_destroy(table->dbt_id_space);
351	kmem_cache_destroy(table->dbt_mem_cache);
352	kmem_free(table, sizeof (rfs4_table_t));
353}
354
355rfs4_index_t *
356rfs4_index_create(rfs4_table_t *table, char *keyname,
357    uint32_t (*hash)(void *),
358    bool_t (compare)(rfs4_entry_t, void *),
359    void *(*mkkey)(rfs4_entry_t),
360    bool_t createable)
361{
362	rfs4_index_t *idx;
363
364	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
365
366	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
367
368	idx->dbi_table = table;
369	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
370	(void) strcpy(idx->dbi_keyname, keyname);
371	idx->dbi_hash = hash;
372	idx->dbi_compare = compare;
373	idx->dbi_mkkey = mkkey;
374	idx->dbi_tblidx = table->dbt_idxcnt;
375	table->dbt_idxcnt++;
376	if (createable) {
377		table->dbt_ccnt++;
378		if (table->dbt_ccnt > 1)
379			panic("Table %s currently can have only have one "
380			    "index that will allow creation of entries",
381			    table->dbt_name);
382		idx->dbi_createable = TRUE;
383	} else {
384		idx->dbi_createable = FALSE;
385	}
386
387	idx->dbi_inext = table->dbt_indices;
388	table->dbt_indices = idx;
389	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
390	    KM_SLEEP);
391
392	return (idx);
393}
394
395void
396rfs4_index_destroy(rfs4_index_t *idx)
397{
398	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
399	kmem_free(idx->dbi_buckets,
400	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
401	kmem_free(idx, sizeof (rfs4_index_t));
402}
403
404static void
405rfs4_dbe_destroy(rfs4_dbe_t *entry)
406{
407	rfs4_index_t *idx;
408	void *key;
409	int i;
410	rfs4_bucket_t *bp;
411	rfs4_table_t *table = entry->dbe_table;
412	rfs4_link_t *l;
413
414	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
415	    (CE_NOTE, "Destroying entry %p from %s",
416	    (void*)entry, table->dbt_name));
417
418	mutex_enter(entry->dbe_lock);
419	ASSERT(entry->dbe_refcnt == 0);
420	mutex_exit(entry->dbe_lock);
421
422	/* Unlink from all indices */
423	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
424		l = &entry->dbe_indices[idx->dbi_tblidx];
425		/* check and see if we were ever linked in to the index */
426		if (INVALID_LINK(l)) {
427			ASSERT(l->next == NULL && l->prev == NULL);
428			continue;
429		}
430		key = idx->dbi_mkkey(entry->dbe_data);
431		i = HASH(idx, key);
432		bp = &idx->dbi_buckets[i];
433		ASSERT(bp->dbk_head != NULL);
434		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
435	}
436
437	/* Destroy user data */
438	if (table->dbt_destroy)
439		(*table->dbt_destroy)(entry->dbe_data);
440
441	if (table->dbt_id_space)
442		id_free(table->dbt_id_space, entry->dbe_id);
443
444	mutex_enter(table->dbt_lock);
445	table->dbt_count--;
446	mutex_exit(table->dbt_lock);
447
448	/* Destroy the entry itself */
449	kmem_cache_free(table->dbt_mem_cache, entry);
450}
451
452
453static rfs4_dbe_t *
454rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
455{
456	rfs4_dbe_t *entry;
457	int i;
458
459	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
460	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));
461
462	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
463
464	entry->dbe_refcnt = 1;
465	entry->dbe_invalid = FALSE;
466	entry->dbe_skipsearch = FALSE;
467	entry->dbe_time_rele = 0;
468	entry->dbe_id = 0;
469
470	if (table->dbt_id_space)
471		entry->dbe_id = id;
472	entry->dbe_table = table;
473
474	for (i = 0; i < table->dbt_maxcnt; i++) {
475		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
476		entry->dbe_indices[i].entry = entry;
477		/*
478		 * We mark the entry as not indexed by setting the low
479		 * order bit, since address are word aligned. This has
480		 * the advantage of causeing a trap if the address is
481		 * used. After the entry is linked in to the
482		 * corresponding index the bit will be cleared.
483		 */
484		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
485	}
486
487	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
488	bzero(entry->dbe_data, table->dbt_usize);
489	entry->dbe_data->dbe = entry;
490
491	if (!(*table->dbt_create)(entry->dbe_data, data)) {
492		kmem_cache_free(table->dbt_mem_cache, entry);
493		return (NULL);
494	}
495
496	mutex_enter(table->dbt_lock);
497	table->dbt_count++;
498	mutex_exit(table->dbt_lock);
499
500	return (entry);
501}
502
/*
 * Look up an entry through index "idx" using "key", optionally
 * creating it when no match exists.
 *
 * On entry *create says whether creation is permitted.  When an
 * existing entry is returned *create is set to FALSE; it is left
 * untouched on the creation and failure paths.  "arg" is passed to the
 * table's create callback.  Entries flagged dbe_skipsearch are only
 * matched when dbsearch_type is RFS4_DBS_INVALID.
 *
 * Returns the matching (or newly created) entry's user data with a
 * reference held for the caller, or NULL when nothing matched and no
 * entry was created (creation not requested or not possible through
 * this index, table full, or the create callback failed).
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int already_done;
	uint32_t i;
	rfs4_table_t *table = idx->dbi_table;
	rfs4_index_t *ip;
	rfs4_bucket_t *bp;
	rfs4_link_t *l;
	rfs4_dbe_t *entry;
	id_t id = -1;	/* -1 means no id has been allocated yet */

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	rw_enter(bp->dbk_lock, RW_READER);
	/*
	 * Every path that jumps back here has dropped and re-taken the
	 * bucket lock (as writer), so the chain may have changed and is
	 * scanned again from the head.
	 */
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			/*
			 * The reference count was tested above without
			 * the entry lock; test it again under the lock
			 * before taking a hold.
			 */
			mutex_enter(l->entry->dbe_lock);
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			/* An id allocated on an earlier pass is unused. */
			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	/*
	 * No match.  Give up unless the caller asked for creation, the
	 * table has a create callback, this index allows creation and
	 * the table is not already at its entry limit.
	 */
	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	if (table->dbt_id_space && id == -1) {
		/* get an id but don't sleep for it */
		id = id_alloc_nosleep(table->dbt_id_space);
		if (id == -1) {
			/* Drop the bucket lock before a sleeping allocation */
			rw_exit(bp->dbk_lock);

			/* get an id, ok to sleep for it here */
			id = id_alloc(table->dbt_id_space);

			rw_enter(bp->dbk_lock, RW_WRITER);
			goto retry;
		}
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		/* The upgrade failed: re-take the lock as writer and rescan */
		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	/*
	 * Link the new entry on the table's remaining indices, using the
	 * key each index derives from the entry's data.
	 */
	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}
634
635/*ARGSUSED*/
636boolean_t
637rfs4_cpr_callb(void *arg, int code)
638{
639	rfs4_table_t *table = rfs4_client_tab;
640	rfs4_bucket_t *buckets, *bp;
641	rfs4_link_t *l;
642	rfs4_client_t *cp;
643	int i;
644
645	/*
646	 * We get called for Suspend and Resume events.
647	 * For the suspend case we simply don't care!  Nor do we care if
648	 * there are no clients.
649	 */
650	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
651		return (B_TRUE);
652	}
653
654	buckets = table->dbt_indices->dbi_buckets;
655
656	/*
657	 * When we get this far we are in the process of
658	 * resuming the system from a previous suspend.
659	 *
660	 * We are going to blast through and update the
661	 * last_access time for all the clients and in
662	 * doing so extend them by one lease period.
663	 */
664	for (i = 0; i < table->dbt_len; i++) {
665		bp = &buckets[i];
666		for (l = bp->dbk_head; l; l = l->next) {
667			cp = (rfs4_client_t *)l->entry->dbe_data;
668			cp->rc_last_access = gethrestime_sec();
669		}
670	}
671
672	return (B_TRUE);
673}
674
675/*
676 * Given a table, lock each of the buckets and walk all entries (in
677 * turn locking those) and calling the provided "callout" function
678 * with the provided parameter.  Obviously used to iterate across all
679 * entries in a particular table via the database locking hierarchy.
680 * Obviously the caller must not hold locks on any of the entries in
681 * the specified table.
682 */
683void
684rfs4_dbe_walk(rfs4_table_t *table,
685    void (*callout)(rfs4_entry_t, void *),
686    void *data)
687{
688	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
689	rfs4_link_t *l;
690	rfs4_dbe_t *entry;
691	int i;
692
693	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
694	    (CE_NOTE, "Walking entries in %s", table->dbt_name));
695
696	/* Walk the buckets looking for entries to release/destroy */
697	for (i = 0; i < table->dbt_len; i++) {
698		bp = &buckets[i];
699		rw_enter(bp->dbk_lock, RW_READER);
700		for (l = bp->dbk_head; l; l = l->next) {
701			entry = l->entry;
702			mutex_enter(entry->dbe_lock);
703			(*callout)(entry->dbe_data, data);
704			mutex_exit(entry->dbe_lock);
705		}
706		rw_exit(bp->dbk_lock);
707	}
708
709	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
710	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
711}
712
713
/*
 * Release and destroy reapable entries of "table".
 *
 * An entry is reapable when its only reference is the hash table's
 * (reference count of 1) and either the table is shutting down, the
 * table has no expiry callback, or the expiry callback returns true.
 *
 * "cache_time" is only used in the debug messages below.  "desired" is
 * the number of entries after which the pass stops early; 0 means no
 * limit.  The limit is checked once per bucket and is ignored during
 * a table shutdown, where each bucket is re-processed until it is
 * empty.
 *
 * Buckets are taken from the table's first listed index.
 */
static void
rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
{
	rfs4_index_t *idx = table->dbt_indices;
	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
	rfs4_link_t *l, *t;
	rfs4_dbe_t *entry;
	bool_t found;
	int i;
	int count = 0;	/* entries selected for destruction so far */

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
	    desired, cache_time, table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		do {
			found = FALSE;
			/*
			 * Pass 1, bucket held as reader: pick the victims
			 * by dropping their last reference (1 -> 0).
			 */
			rw_enter(bp->dbk_lock, RW_READER);
			for (l = bp->dbk_head; l; l = l->next) {
				entry = l->entry;
				/*
				 * Examine an entry.  Ref count of 1 means
				 * that the only reference is for the hash
				 * table reference.
				 */
				if (entry->dbe_refcnt != 1)
					continue;
				/*
				 * The test above ran without the entry
				 * lock, so the count is tested again once
				 * the lock is held.
				 */
				mutex_enter(entry->dbe_lock);
				if ((entry->dbe_refcnt == 1) &&
				    (table->dbt_reaper_shutdown ||
				    table->dbt_expiry == NULL ||
				    (*table->dbt_expiry)(entry->dbe_data))) {
					entry->dbe_refcnt--;
					count++;
					found = TRUE;
				}
				mutex_exit(entry->dbe_lock);
			}
			if (found) {
				/*
				 * Pass 2 needs the bucket as writer.  If the
				 * upgrade fails the lock is dropped and
				 * re-taken, so the chain is walked again
				 * from the head.
				 */
				if (!rw_tryupgrade(bp->dbk_lock)) {
					rw_exit(bp->dbk_lock);
					rw_enter(bp->dbk_lock, RW_WRITER);
				}

				l = bp->dbk_head;
				while (l) {
					t = l;
					entry = t->entry;
					/* Advance first: t may be unlinked */
					l = l->next;
					if (entry->dbe_refcnt == 0) {
						DEQUEUE(bp->dbk_head, t);
						t->next = NULL;
						t->prev = NULL;
						INVALIDATE_ADDR(t->entry);
						rfs4_dbe_destroy(entry);
					}
				}
			}
			rw_exit(bp->dbk_lock);
			/*
			 * delay slightly if there is more work to do
			 * with the expectation that other reaper
			 * threads are freeing data structures as well
			 * and in turn will reduce ref counts on
			 * entries in this table allowing them to be
			 * released.  This is only done in the
			 * instance that the tables are being shut down.
			 *
			 * Note that dbk_head is read here, and in the loop
			 * condition below, without the bucket lock held.
			 */
			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
				delay(hz/100);
		/*
		 * If this is a table shutdown, keep going until
		 * everything is gone
		 */
		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);

		/* Stop early once enough entries have been reaped. */
		if (!table->dbt_reaper_shutdown && desired && count >= desired)
			break;
	}

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
	    count, cache_time, table->dbt_name));
}
801
802
/*
 * Body of the per-table reaper thread started by rfs4_start_reaper().
 * "arg" is the table to reap.
 *
 * The thread repeatedly waits on the table's dbt_reaper_wait condition
 * variable, for at most the smaller of rfs4_reap_interval and the
 * table's dbt_max_cache_time seconds, and then runs rfs4_dbe_reap() on
 * the table.  It leaves the loop when the table has been marked for
 * shutdown or when the wait returns 0.  On the way out it decrements
 * the database's db_shutdown_count and signals db_shutdown_wait, which
 * rfs4_database_shutdown() is waiting on.
 */
static void
reaper_thread(caddr_t *arg)
{
	rfs4_table_t *table = (rfs4_table_t *)arg;
	clock_t rc, time;

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));

	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
	    callb_generic_cpr, "nfsv4Reaper");

	/* Sleep interval in seconds; converted to ticks for each wait. */
	time = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
	mutex_enter(&table->dbt_reaper_cv_lock);
	do {
		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
		rc = cv_timedwait_sig(&table->dbt_reaper_wait,
		    &table->dbt_reaper_cv_lock,
		    lbolt + SEC_TO_TICK(time));
		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
		    &table->dbt_reaper_cv_lock);
		/*
		 * A pass is made after every wakeup, including the one
		 * triggered by a shutdown request.
		 */
		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
		/*
		 * NOTE(review): rc == 0 presumably means the wait was
		 * interrupted by a signal - confirm against condvar(9F).
		 */
	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);

	/*
	 * NOTE(review): dbt_reaper_cv_lock is never released explicitly
	 * in this function; presumably CALLB_CPR_EXIT drops it - confirm
	 * against the callb macros.
	 */
	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));

	/* Notify the database shutdown processing that the table is shutdown */
	mutex_enter(table->dbt_db->db_lock);
	table->dbt_db->db_shutdown_count--;
	cv_signal(&table->dbt_db->db_shutdown_wait);
	mutex_exit(table->dbt_db->db_lock);
}
838
839static void
840rfs4_start_reaper(rfs4_table_t *table)
841{
842	if (table->dbt_max_cache_time == 0)
843		return;
844
845	(void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
846	    minclsyspri);
847}
848
#ifdef DEBUG
/*
 * Debug-only helper: log the entry's address, the name of the table it
 * belongs to, its reference count and its id.
 */
void
rfs4_dbe_debug(rfs4_dbe_t *entry)
{
	cmn_err(CE_NOTE, "Entry %p from table %s",
	    (void *)entry, entry->dbe_table->dbt_name);
	/*
	 * The reference count is an unsigned 32-bit value, hence %u.
	 * The continuation line carries its own newline so that it does
	 * not run into whatever is logged next.
	 */
	cmn_err(CE_CONT, "\trefcnt = %u id = %d\n",
	    entry->dbe_refcnt, entry->dbe_id);
}
#endif
859