/* cache.c - routines to maintain an in-core cache of entries */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 2000-2011 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */

#include "portable.h"

#include <stdio.h>

#include <ac/errno.h>
#include <ac/string.h>
#include <ac/socket.h>

#include "slap.h"

#include "back-bdb.h"

#include "ldap_rq.h"

#ifdef BDB_HIER
#define bdb_cache_lru_purge	hdb_cache_lru_purge
#endif
static void bdb_cache_lru_purge( struct bdb_info *bdb );

static int	bdb_cache_delete_internal(Cache *cache, EntryInfo *e, int decr);
#ifdef LDAP_DEBUG
#define SLAPD_UNUSED
#ifdef SLAPD_UNUSED
static void	bdb_lru_print(Cache *cache);
static void	bdb_idtree_print(Cache *cache);
#endif
#endif

/* For concurrency experiments only! */
#if 0
#define	ldap_pvt_thread_rdwr_wlock(a)	0
#define	ldap_pvt_thread_rdwr_wunlock(a)	0
#define	ldap_pvt_thread_rdwr_rlock(a)	0
#define	ldap_pvt_thread_rdwr_runlock(a)	0
#endif

#if 0
#define ldap_pvt_thread_mutex_trylock(a) 0
#endif

static EntryInfo *
bdb_cache_entryinfo_new( Cache *cache )
{
	EntryInfo *ei = NULL;

	if ( cache->c_eifree ) {
		ldap_pvt_thread_mutex_lock( &cache->c_eifree_mutex );
		if ( cache->c_eifree ) {
			ei = cache->c_eifree;
			cache->c_eifree = ei->bei_lrunext;
			ei->bei_finders = 0;
			ei->bei_lrunext = NULL;
		}
		ldap_pvt_thread_mutex_unlock( &cache->c_eifree_mutex );
	}
	if ( !ei ) {
		ei = ch_calloc(1, sizeof(EntryInfo));
		ldap_pvt_thread_mutex_init( &ei->bei_kids_mutex );
	}

	ei->bei_state = CACHE_ENTRY_REFERENCED;

	return ei;
}

static void
bdb_cache_entryinfo_free( Cache *cache, EntryInfo *ei )
{
	free( ei->bei_nrdn.bv_val );
	BER_BVZERO( &ei->bei_nrdn );
#ifdef BDB_HIER
	free( ei->bei_rdn.bv_val );
	BER_BVZERO( &ei->bei_rdn );
	ei->bei_modrdns = 0;
	ei->bei_ckids = 0;
	ei->bei_dkids = 0;
#endif
	ei->bei_parent = NULL;
	ei->bei_kids = NULL;
	ei->bei_lruprev = NULL;

#if 0
	ldap_pvt_thread_mutex_lock( &cache->c_eifree_mutex );
	ei->bei_lrunext = cache->c_eifree;
	cache->c_eifree = ei;
	ldap_pvt_thread_mutex_unlock( &cache->c_eifree_mutex );
#else
	ldap_pvt_thread_mutex_destroy( &ei->bei_kids_mutex );
	ch_free( ei );
#endif
}

#define LRU_DEL( c, e ) do { \
	if ( e == e->bei_lruprev ) { \
		(c)->c_lruhead = (c)->c_lrutail = NULL; \
	} else { \
		if ( e == (c)->c_lruhead ) (c)->c_lruhead = e->bei_lruprev; \
		if ( e == (c)->c_lrutail ) (c)->c_lrutail = e->bei_lruprev; \
		e->bei_lrunext->bei_lruprev = e->bei_lruprev; \
		e->bei_lruprev->bei_lrunext = e->bei_lrunext; \
	} \
	e->bei_lruprev = NULL; \
} while ( 0 )

/* Note - we now use a Second-Chance / Clock algorithm instead of
 * Least-Recently-Used. This tremendously improves concurrency
 * because we no longer need to manipulate the lists every time an
 * entry is touched. We only need to lock the lists when adding
 * or deleting an entry. It's now a circular doubly-linked list.
 * We always append to the tail, but the head traverses the circle
 * during a purge operation.
 */
static void
bdb_cache_lru_link( struct bdb_info *bdb, EntryInfo *ei )
{

	/* Already linked, ignore */
	if ( ei->bei_lruprev )
		return;

	/* Insert into circular LRU list */
	ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_lru_mutex );

	ei->bei_lruprev = bdb->bi_cache.c_lrutail;
	if ( bdb->bi_cache.c_lrutail ) {
		ei->bei_lrunext = bdb->bi_cache.c_lrutail->bei_lrunext;
		bdb->bi_cache.c_lrutail->bei_lrunext = ei;
		if ( ei->bei_lrunext )
			ei->bei_lrunext->bei_lruprev = ei;
	} else {
		ei->bei_lrunext = ei->bei_lruprev = ei;
		bdb->bi_cache.c_lruhead = ei;
	}
	bdb->bi_cache.c_lrutail = ei;
	ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_lru_mutex );
}
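
/* Illustrative only: a minimal sketch of the Second-Chance / Clock
 * scan described in the note above, reduced to its core loop. The
 * function name clock_find_victim is hypothetical; the fields and
 * flags are the real ones from this file. The sketch is not built
 * and omits all the locking and state checks the real purge does.
 */
#if 0
static EntryInfo *
clock_find_victim( Cache *cache )
{
	EntryInfo *e = cache->c_lruhead;

	/* Sweep the circle once: a referenced node gets a second
	 * chance (flag cleared, hand moves on); the first node found
	 * unreferenced is the victim, and the hand stops just past it.
	 */
	do {
		if ( e->bei_state & CACHE_ENTRY_REFERENCED ) {
			e->bei_state &= ~CACHE_ENTRY_REFERENCED;
			e = e->bei_lrunext;
		} else {
			cache->c_lruhead = e->bei_lrunext;
			return e;
		}
	} while ( e != cache->c_lruhead );
	return NULL;	/* all were referenced; the real purge keeps scanning */
}
#endif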

#ifdef NO_THREADS
#define NO_DB_LOCK
#endif

/* #define NO_DB_LOCK 1 */
/* Note: The BerkeleyDB locks are much slower than regular
 * mutexes or rdwr locks. But the BDB implementation has the
 * advantage of using a fixed size lock table, instead of
 * allocating a lock object per entry in the DB. That's a
 * key benefit for scaling. It also frees us from worrying
 * about undetectable deadlocks between BDB activity and our
 * own cache activity. It's still worth exploring faster
 * alternatives though.
 */
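
/* A minimal sketch, for illustration only, of sizing the fixed lock
 * table when the environment is opened. These are standard Berkeley
 * DB 4.x calls, but the function name and the figure of 2000 are
 * arbitrary examples, not the values back-bdb actually configures.
 */
#if 0
static int
example_env_lock_sizing( DB_ENV *env )
{
	int rc;

	/* The table is preallocated once per environment, so lock
	 * memory stays constant no matter how many entries exist.
	 */
	rc = env->set_lk_max_locks( env, 2000 );
	if ( rc == 0 )
		rc = env->set_lk_max_objects( env, 2000 );
	if ( rc == 0 )
		rc = env->set_lk_max_lockers( env, 2000 );
	return rc;
}
#endif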

/* Atomically release and reacquire a lock */
int
bdb_cache_entry_db_relock(
	struct bdb_info *bdb,
	DB_TXN *txn,
	EntryInfo *ei,
	int rw,
	int tryOnly,
	DB_LOCK *lock )
{
#ifdef NO_DB_LOCK
	return 0;
#else
	int	rc;
	DBT	lockobj;
	DB_LOCKREQ list[2];

	if ( !lock ) return 0;

	DBTzero( &lockobj );
	lockobj.data = &ei->bei_id;
	lockobj.size = sizeof(ei->bei_id) + 1;

	list[0].op = DB_LOCK_PUT;
	list[0].lock = *lock;
	list[1].op = DB_LOCK_GET;
	list[1].lock = *lock;
	list[1].mode = rw ? DB_LOCK_WRITE : DB_LOCK_READ;
	list[1].obj = &lockobj;
	rc = bdb->bi_dbenv->lock_vec(bdb->bi_dbenv, TXN_ID(txn), tryOnly ? DB_LOCK_NOWAIT : 0,
		list, 2, NULL );

	if (rc && !tryOnly) {
		Debug( LDAP_DEBUG_TRACE,
			"bdb_cache_entry_db_relock: entry %ld, rw %d, rc %d\n",
			ei->bei_id, rw, rc );
	} else {
		*lock = list[1].lock;
	}
	return rc;
#endif
}

static int
bdb_cache_entry_db_lock( struct bdb_info *bdb, DB_TXN *txn, EntryInfo *ei,
	int rw, int tryOnly, DB_LOCK *lock )
{
#ifdef NO_DB_LOCK
	return 0;
#else
	int       rc;
	DBT       lockobj;
	int       db_rw;

	if ( !lock ) return 0;

	if (rw)
		db_rw = DB_LOCK_WRITE;
	else
		db_rw = DB_LOCK_READ;

	DBTzero( &lockobj );
	lockobj.data = &ei->bei_id;
	lockobj.size = sizeof(ei->bei_id) + 1;

	rc = LOCK_GET(bdb->bi_dbenv, TXN_ID(txn), tryOnly ? DB_LOCK_NOWAIT : 0,
					&lockobj, db_rw, lock);
	if (rc && !tryOnly) {
		Debug( LDAP_DEBUG_TRACE,
			"bdb_cache_entry_db_lock: entry %ld, rw %d, rc %d\n",
			ei->bei_id, rw, rc );
	}
	return rc;
#endif /* NO_DB_LOCK */
}

int
bdb_cache_entry_db_unlock ( struct bdb_info *bdb, DB_LOCK *lock )
{
#ifdef NO_DB_LOCK
	return 0;
#else
	int rc;

	if ( !lock || lock->mode == DB_LOCK_NG ) return 0;

	rc = LOCK_PUT ( bdb->bi_dbenv, lock );
	return rc;
#endif
}

void
bdb_cache_return_entry_rw( struct bdb_info *bdb, Entry *e,
	int rw, DB_LOCK *lock )
{
	EntryInfo *ei;
	int free = 0;

	ei = e->e_private;
	if ( ei && ( ei->bei_state & CACHE_ENTRY_NOT_CACHED )) {
		bdb_cache_entryinfo_lock( ei );
		if ( ei->bei_state & CACHE_ENTRY_NOT_CACHED ) {
			/* Releasing the entry can only be done when
			 * we know that nobody else is using it, i.e. we
			 * should hold an entry_db writelock.  But the
			 * flag is only set by the thread that loads the
			 * entry, and only if no other thread has found
			 * it while it was working.  All other threads
			 * clear the flag, which means that we should be
			 * the only thread using the entry if the flag
			 * is still set here.
			 */
			ei->bei_e = NULL;
			ei->bei_state ^= CACHE_ENTRY_NOT_CACHED;
			free = 1;
		}
		bdb_cache_entryinfo_unlock( ei );
	}
	bdb_cache_entry_db_unlock( bdb, lock );
	if ( free ) {
		e->e_private = NULL;
		bdb_entry_return( e );
	}
}

static int
bdb_cache_entryinfo_destroy( EntryInfo *e )
{
	ldap_pvt_thread_mutex_destroy( &e->bei_kids_mutex );
	free( e->bei_nrdn.bv_val );
#ifdef BDB_HIER
	free( e->bei_rdn.bv_val );
#endif
	free( e );
	return 0;
}

/* Do a length-ordered sort on normalized RDNs */
static int
bdb_rdn_cmp( const void *v_e1, const void *v_e2 )
{
	const EntryInfo *e1 = v_e1, *e2 = v_e2;
	int rc = e1->bei_nrdn.bv_len - e2->bei_nrdn.bv_len;
	if (rc == 0) {
		rc = strncmp( e1->bei_nrdn.bv_val, e2->bei_nrdn.bv_val,
			e1->bei_nrdn.bv_len );
	}
	return rc;
}
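
/* Worked example (comment only): under this length-first comparison,
 * "dc=com" (6 chars) sorts before "ou=people" (9 chars) regardless of
 * lexical order, and two 6-char RDNs like "dc=com" vs "dc=net" fall
 * through to strncmp. Any consistent total order works as an AVL key;
 * comparing lengths first just makes the common case cheap.
 */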

static int
bdb_id_cmp( const void *v_e1, const void *v_e2 )
{
	const EntryInfo *e1 = v_e1, *e2 = v_e2;
	return e1->bei_id - e2->bei_id;
}

static int
bdb_id_dup_err( void *v1, void *v2 )
{
	EntryInfo *e2 = v2;
	e2->bei_lrunext = v1;
	return -1;
}

/* Create an entryinfo in the cache. Caller must release the locks later.
 */
static int
bdb_entryinfo_add_internal(
	struct bdb_info *bdb,
	EntryInfo *ei,
	EntryInfo **res )
{
	EntryInfo *ei2 = NULL;

	*res = NULL;

	ei2 = bdb_cache_entryinfo_new( &bdb->bi_cache );

	bdb_cache_entryinfo_lock( ei->bei_parent );
	ldap_pvt_thread_rdwr_wlock( &bdb->bi_cache.c_rwlock );

	ei2->bei_id = ei->bei_id;
	ei2->bei_parent = ei->bei_parent;
#ifdef BDB_HIER
	ei2->bei_rdn = ei->bei_rdn;
#endif
#ifdef SLAP_ZONE_ALLOC
	ei2->bei_bdb = bdb;
#endif

	/* Add to cache ID tree */
	if (avl_insert( &bdb->bi_cache.c_idtree, ei2, bdb_id_cmp,
		bdb_id_dup_err )) {
		EntryInfo *eix = ei2->bei_lrunext;
		bdb_cache_entryinfo_free( &bdb->bi_cache, ei2 );
		ei2 = eix;
#ifdef BDB_HIER
		/* It got freed above because its value was
		 * assigned to ei2.
		 */
		ei->bei_rdn.bv_val = NULL;
#endif
	} else {
		int rc;

		bdb->bi_cache.c_eiused++;
		ber_dupbv( &ei2->bei_nrdn, &ei->bei_nrdn );

		/* This is a new leaf node. But if parent had no kids, then it was
		 * a leaf and we would be decrementing that. So, only increment if
		 * the parent already has kids.
		 */
		if ( ei->bei_parent->bei_kids || !ei->bei_parent->bei_id )
			bdb->bi_cache.c_leaves++;
		rc = avl_insert( &ei->bei_parent->bei_kids, ei2, bdb_rdn_cmp,
			avl_dup_error );
#ifdef BDB_HIER
		/* it's possible for hdb_cache_find_parent to beat us to it */
		if ( !rc ) {
			ei->bei_parent->bei_ckids++;
		}
#endif
	}

	*res = ei2;
	return 0;
}

/* Find the EntryInfo for the requested DN. If the DN cannot be found, return
 * the info for its closest ancestor. *res should be NULL to process a
 * complete DN starting from the tree root. Otherwise *res must be the
 * immediate parent of the requested DN, and only the RDN will be searched.
 * The EntryInfo is locked upon return and must be unlocked by the caller.
 */
int
bdb_cache_find_ndn(
	Operation	*op,
	DB_TXN		*txn,
	struct berval	*ndn,
	EntryInfo	**res )
{
	struct bdb_info *bdb = (struct bdb_info *) op->o_bd->be_private;
	EntryInfo	ei, *eip, *ei2;
	int rc = 0;
	char *ptr;

	/* this function is always called with normalized DN */
	if ( *res ) {
		/* we're doing a onelevel search for an RDN */
		ei.bei_nrdn.bv_val = ndn->bv_val;
		ei.bei_nrdn.bv_len = dn_rdnlen( op->o_bd, ndn );
		eip = *res;
	} else {
		/* we're searching a full DN from the root */
		ptr = ndn->bv_val + ndn->bv_len - op->o_bd->be_nsuffix[0].bv_len;
		ei.bei_nrdn.bv_val = ptr;
		ei.bei_nrdn.bv_len = op->o_bd->be_nsuffix[0].bv_len;
		/* Skip to next rdn if suffix is empty */
		if ( ei.bei_nrdn.bv_len == 0 ) {
			for (ptr = ei.bei_nrdn.bv_val - 2; ptr > ndn->bv_val
				&& !DN_SEPARATOR(*ptr); ptr--) /* empty */;
			if ( ptr >= ndn->bv_val ) {
				if (DN_SEPARATOR(*ptr)) ptr++;
				ei.bei_nrdn.bv_len = ei.bei_nrdn.bv_val - ptr;
				ei.bei_nrdn.bv_val = ptr;
			}
		}
		eip = &bdb->bi_cache.c_dntree;
	}

	for ( bdb_cache_entryinfo_lock( eip ); eip; ) {
		eip->bei_state |= CACHE_ENTRY_REFERENCED;
		ei.bei_parent = eip;
		ei2 = (EntryInfo *)avl_find( eip->bei_kids, &ei, bdb_rdn_cmp );
		if ( !ei2 ) {
			DBC *cursor;
			int len = ei.bei_nrdn.bv_len;

			if ( BER_BVISEMPTY( ndn )) {
				*res = eip;
				return LDAP_SUCCESS;
			}

			ei.bei_nrdn.bv_len = ndn->bv_len -
				(ei.bei_nrdn.bv_val - ndn->bv_val);
			eip->bei_finders++;
			bdb_cache_entryinfo_unlock( eip );

			BDB_LOG_PRINTF( bdb->bi_dbenv, NULL, "slapd Reading %s",
				ei.bei_nrdn.bv_val );

			cursor = NULL;
			rc = bdb_dn2id( op, &ei.bei_nrdn, &ei, txn, &cursor );
			if (rc) {
				bdb_cache_entryinfo_lock( eip );
				eip->bei_finders--;
				if ( cursor ) cursor->c_close( cursor );
				*res = eip;
				return rc;
			}

			BDB_LOG_PRINTF( bdb->bi_dbenv, NULL, "slapd Read got %s(%d)",
				ei.bei_nrdn.bv_val, ei.bei_id );

			/* DN exists but needs to be added to cache */
			ei.bei_nrdn.bv_len = len;
			rc = bdb_entryinfo_add_internal( bdb, &ei, &ei2 );
			/* add_internal left eip and c_rwlock locked */
			eip->bei_finders--;
			ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );
			if ( cursor ) cursor->c_close( cursor );
			if ( rc ) {
				*res = eip;
				return rc;
			}
		}
		bdb_cache_entryinfo_lock( ei2 );
		if ( ei2->bei_state & CACHE_ENTRY_DELETED ) {
			/* In the midst of deleting? Give it a chance to
			 * complete.
			 */
			bdb_cache_entryinfo_unlock( ei2 );
			bdb_cache_entryinfo_unlock( eip );
			ldap_pvt_thread_yield();
			bdb_cache_entryinfo_lock( eip );
			*res = eip;
			return DB_NOTFOUND;
		}
		bdb_cache_entryinfo_unlock( eip );

		eip = ei2;

		/* Advance to next lower RDN */
		for (ptr = ei.bei_nrdn.bv_val - 2; ptr > ndn->bv_val
			&& !DN_SEPARATOR(*ptr); ptr--) /* empty */;
		if ( ptr >= ndn->bv_val ) {
			if (DN_SEPARATOR(*ptr)) ptr++;
			ei.bei_nrdn.bv_len = ei.bei_nrdn.bv_val - ptr - 1;
			ei.bei_nrdn.bv_val = ptr;
		}
		if ( ptr < ndn->bv_val ) {
			*res = eip;
			break;
		}
	}

	return rc;
}
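
/* Illustrative only: the typical calling pattern for the function
 * above, with error handling reduced to the minimum. The variable
 * names are hypothetical.
 */
#if 0
{
	EntryInfo *eip = NULL;	/* NULL: resolve the full DN from the root */
	int rc = bdb_cache_find_ndn( op, txn, &op->o_req_ndn, &eip );

	/* On success eip is the target's info; on failure (e.g.
	 * DB_NOTFOUND) it is the closest existing ancestor. Either
	 * way it comes back locked and we must unlock it.
	 */
	if ( eip )
		bdb_cache_entryinfo_unlock( eip );
}
#endif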

#ifdef BDB_HIER
/* Walk up the tree from a child node, looking for an ID that's already
 * been linked into the cache.
 */
int
hdb_cache_find_parent(
	Operation *op,
	DB_TXN	*txn,
	ID id,
	EntryInfo **res )
{
	struct bdb_info *bdb = (struct bdb_info *) op->o_bd->be_private;
	EntryInfo ei, eip, *ei2 = NULL, *ein = NULL, *eir = NULL;
	int rc, add;

	ei.bei_id = id;
	ei.bei_kids = NULL;
	ei.bei_ckids = 0;

	for (;;) {
		rc = hdb_dn2id_parent( op, txn, &ei, &eip.bei_id );
		if ( rc ) break;

		/* Save the previous node, if any */
		ei2 = ein;

		/* Create a new node for the current ID */
		ein = bdb_cache_entryinfo_new( &bdb->bi_cache );
		ein->bei_id = ei.bei_id;
		ein->bei_kids = ei.bei_kids;
		ein->bei_nrdn = ei.bei_nrdn;
		ein->bei_rdn = ei.bei_rdn;
		ein->bei_ckids = ei.bei_ckids;
#ifdef SLAP_ZONE_ALLOC
		ein->bei_bdb = bdb;
#endif
		ei.bei_ckids = 0;
		add = 1;

		/* This node is not fully connected yet */
		ein->bei_state |= CACHE_ENTRY_NOT_LINKED;

		/* If this is the first time, save this node
		 * to be returned later.
		 */
		if ( eir == NULL ) {
			eir = ein;
			ein->bei_finders++;
		}

again:
		/* Insert this node into the ID tree */
		ldap_pvt_thread_rdwr_wlock( &bdb->bi_cache.c_rwlock );
		if ( avl_insert( &bdb->bi_cache.c_idtree, (caddr_t)ein,
			bdb_id_cmp, bdb_id_dup_err ) ) {
			EntryInfo *eix = ein->bei_lrunext;

			if ( bdb_cache_entryinfo_trylock( eix )) {
				ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );
				ldap_pvt_thread_yield();
				goto again;
			}
			ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );

			/* Someone else created this node just before us.
			 * Free our new copy and use the existing one.
			 */
			bdb_cache_entryinfo_free( &bdb->bi_cache, ein );

			/* if it was the node we were looking for, just return it */
			if ( eir == ein ) {
				*res = eix;
				rc = 0;
				break;
			}

			ein = ei2;
			ei2 = eix;
			add = 0;

			/* otherwise, link up what we have and return */
			goto gotparent;
		}

		/* If there was a previous node, link it to this one */
		if ( ei2 ) ei2->bei_parent = ein;

		/* Look for this node's parent */
par2:
		if ( eip.bei_id ) {
			ei2 = (EntryInfo *) avl_find( bdb->bi_cache.c_idtree,
					(caddr_t) &eip, bdb_id_cmp );
		} else {
			ei2 = &bdb->bi_cache.c_dntree;
		}
		if ( ei2 && bdb_cache_entryinfo_trylock( ei2 )) {
			ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );
			ldap_pvt_thread_yield();
			ldap_pvt_thread_rdwr_wlock( &bdb->bi_cache.c_rwlock );
			goto par2;
		}
		if ( add )
			bdb->bi_cache.c_eiused++;
		if ( ei2 && ( ei2->bei_kids || !ei2->bei_id ))
			bdb->bi_cache.c_leaves++;
		ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );

gotparent:
		/* Got the parent, link in and we're done. */
		if ( ei2 ) {
			bdb_cache_entryinfo_lock( eir );
			ein->bei_parent = ei2;

			if ( avl_insert( &ei2->bei_kids, (caddr_t)ein, bdb_rdn_cmp,
				avl_dup_error) == 0 )
				ei2->bei_ckids++;

			/* Reset all the state info */
			for (ein = eir; ein != ei2; ein=ein->bei_parent)
				ein->bei_state &= ~CACHE_ENTRY_NOT_LINKED;

			bdb_cache_entryinfo_unlock( ei2 );
			eir->bei_finders--;

			*res = eir;
			break;
		}
		ei.bei_kids = NULL;
		ei.bei_id = eip.bei_id;
		ei.bei_ckids = 1;
		avl_insert( &ei.bei_kids, (caddr_t)ein, bdb_rdn_cmp,
			avl_dup_error );
	}
	return rc;
}

/* Used by hdb_dn2idl when loading the EntryInfo for all the children
 * of a given node
 */
int hdb_cache_load(
	struct bdb_info *bdb,
	EntryInfo *ei,
	EntryInfo **res )
{
	EntryInfo *ei2;
	int rc;

	/* See if we already have this one */
	bdb_cache_entryinfo_lock( ei->bei_parent );
	ei2 = (EntryInfo *)avl_find( ei->bei_parent->bei_kids, ei, bdb_rdn_cmp );
	bdb_cache_entryinfo_unlock( ei->bei_parent );

	if ( !ei2 ) {
		/* Not found, add it */
		struct berval bv;

		/* bei_rdn was not malloc'd before, do it now */
		ber_dupbv( &bv, &ei->bei_rdn );
		ei->bei_rdn = bv;

		rc = bdb_entryinfo_add_internal( bdb, ei, res );
		bdb_cache_entryinfo_unlock( ei->bei_parent );
		ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );
	} else {
		/* Found, return it */
		*res = ei2;
		return 0;
	}
	return rc;
}
#endif

/* This is best-effort only. If all entries in the cache are
 * busy, they will all be kept. This is unlikely to happen
 * unless the cache is very much smaller than the working set.
 */
static void
bdb_cache_lru_purge( struct bdb_info *bdb )
{
	DB_LOCK		lock, *lockp;
	EntryInfo *elru, *elnext = NULL;
	int islocked;
	ID eicount, ecount;
	ID count, efree, eifree = 0;
#ifdef LDAP_DEBUG
	int iter;
#endif

	/* Wait for the mutex; we're the only one trying to purge. */
	ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_lru_mutex );

	if ( bdb->bi_cache.c_cursize > bdb->bi_cache.c_maxsize ) {
		efree = bdb->bi_cache.c_cursize - bdb->bi_cache.c_maxsize;
		efree += bdb->bi_cache.c_minfree;
	} else {
		efree = 0;
	}

	/* maximum number of EntryInfo leaves to cache. In slapcat
	 * we always free all leaf nodes.
	 */

	if ( slapMode & SLAP_TOOL_READONLY ) {
		eifree = bdb->bi_cache.c_leaves;
	} else if ( bdb->bi_cache.c_eimax &&
		bdb->bi_cache.c_leaves > bdb->bi_cache.c_eimax ) {
		eifree = bdb->bi_cache.c_minfree * 10;
		if ( eifree >= bdb->bi_cache.c_leaves )
			eifree /= 2;
	}

	if ( !efree && !eifree ) {
		ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_lru_mutex );
		bdb->bi_cache.c_purging = 0;
		return;
	}

	if ( bdb->bi_cache.c_txn ) {
		lockp = &lock;
	} else {
		lockp = NULL;
	}

	count = 0;
	eicount = 0;
	ecount = 0;
#ifdef LDAP_DEBUG
	iter = 0;
#endif

	/* Look for an unused entry to remove */
	for ( elru = bdb->bi_cache.c_lruhead; elru; elru = elnext ) {
		elnext = elru->bei_lrunext;

		if ( bdb_cache_entryinfo_trylock( elru ))
			goto bottom;

		/* This flag implements the clock replacement behavior */
		if ( elru->bei_state & ( CACHE_ENTRY_REFERENCED )) {
			elru->bei_state &= ~CACHE_ENTRY_REFERENCED;
			bdb_cache_entryinfo_unlock( elru );
			goto bottom;
		}

		/* If this node is in the process of linking into the cache,
		 * or this node is being deleted, skip it.
		 */
		if (( elru->bei_state & ( CACHE_ENTRY_NOT_LINKED |
			CACHE_ENTRY_DELETED | CACHE_ENTRY_LOADING |
			CACHE_ENTRY_ONELEVEL )) ||
			elru->bei_finders > 0 ) {
			bdb_cache_entryinfo_unlock( elru );
			goto bottom;
		}

		if ( bdb_cache_entryinfo_trylock( elru->bei_parent )) {
			bdb_cache_entryinfo_unlock( elru );
			goto bottom;
		}

		/* entryinfo is locked */
		islocked = 1;

		/* If we can successfully writelock it, then
		 * the object is idle.
		 */
		if ( bdb_cache_entry_db_lock( bdb,
			bdb->bi_cache.c_txn, elru, 1, 1, lockp ) == 0 ) {

			/* Free entry for this node if it's present */
			if ( elru->bei_e ) {
				ecount++;

				/* the cache may have gone over the limit while we
				 * weren't looking, so double check.
				 */
				if ( !efree && ecount > bdb->bi_cache.c_maxsize )
					efree = bdb->bi_cache.c_minfree;

				if ( count < efree ) {
					elru->bei_e->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
					bdb_entry_return( bdb, elru->bei_e, elru->bei_zseq );
#else
					bdb_entry_return( elru->bei_e );
#endif
					elru->bei_e = NULL;
					count++;
				} else {
					/* Keep this node cached, skip to next */
					bdb_cache_entry_db_unlock( bdb, lockp );
					goto next;
				}
			}
			bdb_cache_entry_db_unlock( bdb, lockp );

			/*
			 * If it is a leaf node, and we're over the limit, free it.
			 */
			if ( elru->bei_kids ) {
				/* Drop from list, we ignore it... */
				LRU_DEL( &bdb->bi_cache, elru );
			} else if ( eicount < eifree ) {
				/* Too many leaf nodes, free this one */
				bdb_cache_delete_internal( &bdb->bi_cache, elru, 0 );
				bdb_cache_delete_cleanup( &bdb->bi_cache, elru );
				islocked = 0;
				eicount++;
			}	/* Leave on list until we need to free it */
		}

next:
		if ( islocked ) {
			bdb_cache_entryinfo_unlock( elru );
			bdb_cache_entryinfo_unlock( elru->bei_parent );
		}

		if ( count >= efree && eicount >= eifree )
			break;
bottom:
		if ( elnext == bdb->bi_cache.c_lruhead )
			break;
#ifdef LDAP_DEBUG
		iter++;
#endif
	}

	if ( count || ecount > bdb->bi_cache.c_cursize ) {
		ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_count_mutex );
		/* HACK: we seem to be losing track, fix up now */
		if ( ecount > bdb->bi_cache.c_cursize )
			bdb->bi_cache.c_cursize = ecount;
		bdb->bi_cache.c_cursize -= count;
		ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_count_mutex );
	}
	bdb->bi_cache.c_lruhead = elnext;
	ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_lru_mutex );
	bdb->bi_cache.c_purging = 0;
}
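
/* Worked example (comment only) of the purge arithmetic above: with
 * c_maxsize=1000, c_minfree=1 and c_cursize=1010, efree = (1010 -
 * 1000) + 1 = 11, so the scan tries to release 11 entries, ending
 * slightly below the limit instead of exactly at it, which avoids
 * purging again on the very next insertion.
 */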

/*
 * cache_find_id - find an entry in the cache, given id.
 * The entry is locked for Read upon return. Call with flag ID_LOCKED if
 * the supplied *eip was already locked.
 */

int
bdb_cache_find_id(
	Operation *op,
	DB_TXN	*tid,
	ID				id,
	EntryInfo	**eip,
	int		flag,
	DB_LOCK		*lock )
{
	struct bdb_info *bdb = (struct bdb_info *) op->o_bd->be_private;
	Entry	*ep = NULL;
	int	rc = 0, load = 0;
	EntryInfo ei = { 0 };

	ei.bei_id = id;

#ifdef SLAP_ZONE_ALLOC
	slap_zh_rlock(bdb->bi_cache.c_zctx);
#endif
	/* If we weren't given any info, see if we have it already cached */
	if ( !*eip ) {
again:	ldap_pvt_thread_rdwr_rlock( &bdb->bi_cache.c_rwlock );
		*eip = (EntryInfo *) avl_find( bdb->bi_cache.c_idtree,
			(caddr_t) &ei, bdb_id_cmp );
		if ( *eip ) {
			/* If the lock attempt fails, the info is in use */
			if ( bdb_cache_entryinfo_trylock( *eip )) {
				int del = (*eip)->bei_state & CACHE_ENTRY_DELETED;
				ldap_pvt_thread_rdwr_runlock( &bdb->bi_cache.c_rwlock );
				/* If this node is being deleted, treat
				 * as if the delete has already finished
				 */
				if ( del ) {
					return DB_NOTFOUND;
				}
				/* otherwise, wait for the info to free up */
				ldap_pvt_thread_yield();
				goto again;
			}
			/* If this info isn't hooked up to its parent yet,
			 * unlock and wait for it to be fully initialized
			 */
			if ( (*eip)->bei_state & CACHE_ENTRY_NOT_LINKED ) {
				bdb_cache_entryinfo_unlock( *eip );
				ldap_pvt_thread_rdwr_runlock( &bdb->bi_cache.c_rwlock );
				ldap_pvt_thread_yield();
				goto again;
			}
			flag |= ID_LOCKED;
		}
		ldap_pvt_thread_rdwr_runlock( &bdb->bi_cache.c_rwlock );
	}

	/* See if the ID exists in the database; add it to the cache if so */
	if ( !*eip ) {
#ifndef BDB_HIER
		rc = bdb_id2entry( op->o_bd, tid, id, &ep );
		if ( rc == 0 ) {
			rc = bdb_cache_find_ndn( op, tid,
				&ep->e_nname, eip );
			if ( *eip ) flag |= ID_LOCKED;
			if ( rc ) {
				ep->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
				bdb_entry_return( bdb, ep, (*eip)->bei_zseq );
#else
				bdb_entry_return( ep );
#endif
				ep = NULL;
			}
		}
#else
		rc = hdb_cache_find_parent(op, tid, id, eip );
		if ( rc == 0 ) flag |= ID_LOCKED;
#endif
	}

	/* Ok, we found the info, do we have the entry? */
	if ( rc == 0 ) {
		if ( !( flag & ID_LOCKED )) {
			bdb_cache_entryinfo_lock( *eip );
			flag |= ID_LOCKED;
		}

		if ( (*eip)->bei_state & CACHE_ENTRY_DELETED ) {
			rc = DB_NOTFOUND;
		} else {
			(*eip)->bei_finders++;
			(*eip)->bei_state |= CACHE_ENTRY_REFERENCED;
			if ( flag & ID_NOENTRY ) {
				bdb_cache_entryinfo_unlock( *eip );
				return 0;
			}
			/* Make sure only one thread tries to load the entry */
load1:
#ifdef SLAP_ZONE_ALLOC
			if ((*eip)->bei_e && !slap_zn_validate(
					bdb->bi_cache.c_zctx, (*eip)->bei_e, (*eip)->bei_zseq)) {
				(*eip)->bei_e = NULL;
				(*eip)->bei_zseq = 0;
			}
#endif
			if ( !(*eip)->bei_e && !((*eip)->bei_state & CACHE_ENTRY_LOADING)) {
				load = 1;
				(*eip)->bei_state |= CACHE_ENTRY_LOADING;
				flag |= ID_CHKPURGE;
			}

			if ( !load ) {
				/* Clear the uncached state if we are not
				 * loading it, i.e. it is already cached or
				 * another thread is currently loading it.
				 */
				if ( (*eip)->bei_state & CACHE_ENTRY_NOT_CACHED ) {
					(*eip)->bei_state ^= CACHE_ENTRY_NOT_CACHED;
					flag |= ID_CHKPURGE;
				}
			}

			if ( flag & ID_LOCKED ) {
				bdb_cache_entryinfo_unlock( *eip );
				flag ^= ID_LOCKED;
			}
			rc = bdb_cache_entry_db_lock( bdb, tid, *eip, load, 0, lock );
			if ( (*eip)->bei_state & CACHE_ENTRY_DELETED ) {
				rc = DB_NOTFOUND;
				bdb_cache_entry_db_unlock( bdb, lock );
				bdb_cache_entryinfo_lock( *eip );
				(*eip)->bei_finders--;
				bdb_cache_entryinfo_unlock( *eip );
			} else if ( rc == 0 ) {
				if ( load ) {
					if ( !ep ) {
						rc = bdb_id2entry( op->o_bd, tid, id, &ep );
					}
					if ( rc == 0 ) {
						ep->e_private = *eip;
#ifdef BDB_HIER
						while ( (*eip)->bei_state & CACHE_ENTRY_NOT_LINKED )
							ldap_pvt_thread_yield();
						bdb_fix_dn( ep, 0 );
#endif
						bdb_cache_entryinfo_lock( *eip );

						(*eip)->bei_e = ep;
#ifdef SLAP_ZONE_ALLOC
						(*eip)->bei_zseq = *((ber_len_t *)ep - 2);
#endif
						ep = NULL;
						if ( flag & ID_NOCACHE ) {
							/* Set the cached state only if no other thread
							 * found the info while we were loading the entry.
							 */
							if ( (*eip)->bei_finders == 1 ) {
								(*eip)->bei_state |= CACHE_ENTRY_NOT_CACHED;
								flag ^= ID_CHKPURGE;
							}
						}
						bdb_cache_entryinfo_unlock( *eip );
						bdb_cache_lru_link( bdb, *eip );
					}
					if ( rc == 0 ) {
						/* If we succeeded, downgrade back to a readlock. */
						rc = bdb_cache_entry_db_relock( bdb, tid,
							*eip, 0, 0, lock );
					} else {
						/* Otherwise, release the lock. */
						bdb_cache_entry_db_unlock( bdb, lock );
					}
				} else if ( !(*eip)->bei_e ) {
					/* Some other thread is trying to load the entry,
					 * wait for it to finish.
					 */
					bdb_cache_entry_db_unlock( bdb, lock );
					bdb_cache_entryinfo_lock( *eip );
					flag |= ID_LOCKED;
					goto load1;
#ifdef BDB_HIER
				} else {
					/* Check for subtree renames
					 */
					rc = bdb_fix_dn( (*eip)->bei_e, 1 );
					if ( rc ) {
						bdb_cache_entry_db_relock( bdb,
							tid, *eip, 1, 0, lock );
						/* check again in case other modifier did it already */
						if ( bdb_fix_dn( (*eip)->bei_e, 1 ) )
							rc = bdb_fix_dn( (*eip)->bei_e, 2 );
						bdb_cache_entry_db_relock( bdb,
							tid, *eip, 0, 0, lock );
					}
#endif
				}
				bdb_cache_entryinfo_lock( *eip );
				(*eip)->bei_finders--;
				if ( load )
					(*eip)->bei_state ^= CACHE_ENTRY_LOADING;
				bdb_cache_entryinfo_unlock( *eip );
			}
		}
	}
	if ( flag & ID_LOCKED ) {
		bdb_cache_entryinfo_unlock( *eip );
	}
	if ( ep ) {
		ep->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
		bdb_entry_return( bdb, ep, (*eip)->bei_zseq );
#else
		bdb_entry_return( ep );
#endif
	}
	if ( rc == 0 ) {
		int purge = 0;

		if (( flag & ID_CHKPURGE ) || bdb->bi_cache.c_eimax ) {
			ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_count_mutex );
			if ( flag & ID_CHKPURGE ) {
				bdb->bi_cache.c_cursize++;
				if ( !bdb->bi_cache.c_purging && bdb->bi_cache.c_cursize > bdb->bi_cache.c_maxsize ) {
					purge = 1;
					bdb->bi_cache.c_purging = 1;
				}
			} else if ( !bdb->bi_cache.c_purging && bdb->bi_cache.c_eimax && bdb->bi_cache.c_leaves > bdb->bi_cache.c_eimax ) {
				purge = 1;
				bdb->bi_cache.c_purging = 1;
			}
			ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_count_mutex );
		}
		if ( purge )
			bdb_cache_lru_purge( bdb );
	}

#ifdef SLAP_ZONE_ALLOC
	if (rc == 0 && (*eip)->bei_e) {
		slap_zn_rlock(bdb->bi_cache.c_zctx, (*eip)->bei_e);
	}
	slap_zh_runlock(bdb->bi_cache.c_zctx);
#endif
	return rc;
}
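
/* Illustrative only: how a reader typically pairs the function above
 * with bdb_cache_return_entry_rw(). Variable names are hypothetical.
 */
#if 0
{
	EntryInfo *ei = NULL;
	DB_LOCK lock;
	int rc;

	rc = bdb_cache_find_id( op, txn, id, &ei, 0, &lock );
	if ( rc == 0 ) {
		Entry *e = ei->bei_e;	/* readlocked for us by find_id */
		/* ... read from e ... */
		bdb_cache_return_entry_rw( bdb, e, 0, &lock );
	}
}
#endif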

int
bdb_cache_children(
	Operation *op,
	DB_TXN *txn,
	Entry *e )
{
	int rc;

	if ( BEI(e)->bei_kids ) {
		return 0;
	}
	if ( BEI(e)->bei_state & CACHE_ENTRY_NO_KIDS ) {
		return DB_NOTFOUND;
	}
	rc = bdb_dn2id_children( op, txn, e );
	if ( rc == DB_NOTFOUND ) {
		BEI(e)->bei_state |= CACHE_ENTRY_NO_KIDS | CACHE_ENTRY_NO_GRANDKIDS;
	}
	return rc;
}

/* Update the cache after a successful database Add. */
int
bdb_cache_add(
	struct bdb_info *bdb,
	EntryInfo *eip,
	Entry *e,
	struct berval *nrdn,
	DB_TXN *txn,
	DB_LOCK *lock )
{
	EntryInfo *new, ei;
	int rc, purge = 0;
#ifdef BDB_HIER
	struct berval rdn = e->e_name;
#endif

	ei.bei_id = e->e_id;
	ei.bei_parent = eip;
	ei.bei_nrdn = *nrdn;
	ei.bei_lockpad = 0;

#if 0
	/* Lock this entry so that bdb_add can run to completion.
	 * It can only fail if BDB has run out of lock resources.
	 */
	rc = bdb_cache_entry_db_lock( bdb, txn, &ei, 0, 0, lock );
	if ( rc ) {
		bdb_cache_entryinfo_unlock( eip );
		return rc;
	}
#endif

#ifdef BDB_HIER
	if ( nrdn->bv_len != e->e_nname.bv_len ) {
		char *ptr = ber_bvchr( &rdn, ',' );
		assert( ptr != NULL );
		rdn.bv_len = ptr - rdn.bv_val;
	}
	ber_dupbv( &ei.bei_rdn, &rdn );
	if ( eip->bei_dkids ) eip->bei_dkids++;
#endif

	if (eip->bei_parent) {
		bdb_cache_entryinfo_lock( eip->bei_parent );
		eip->bei_parent->bei_state &= ~CACHE_ENTRY_NO_GRANDKIDS;
		bdb_cache_entryinfo_unlock( eip->bei_parent );
	}

	rc = bdb_entryinfo_add_internal( bdb, &ei, &new );
	/* bdb_csn_commit can cause this when adding the database root entry */
	if ( new->bei_e ) {
		new->bei_e->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
		bdb_entry_return( bdb, new->bei_e, new->bei_zseq );
#else
		bdb_entry_return( new->bei_e );
#endif
	}
	new->bei_e = e;
	e->e_private = new;
	new->bei_state |= CACHE_ENTRY_NO_KIDS | CACHE_ENTRY_NO_GRANDKIDS;
	eip->bei_state &= ~CACHE_ENTRY_NO_KIDS;
	bdb_cache_entryinfo_unlock( eip );

	ldap_pvt_thread_rdwr_wunlock( &bdb->bi_cache.c_rwlock );
	ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_count_mutex );
	++bdb->bi_cache.c_cursize;
	if ( bdb->bi_cache.c_cursize > bdb->bi_cache.c_maxsize &&
		!bdb->bi_cache.c_purging ) {
		purge = 1;
		bdb->bi_cache.c_purging = 1;
	}
	ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_count_mutex );

	new->bei_finders = 1;
	bdb_cache_lru_link( bdb, new );

	if ( purge )
		bdb_cache_lru_purge( bdb );

	return rc;
}

void bdb_cache_deref(
	EntryInfo *ei
	)
{
	bdb_cache_entryinfo_lock( ei );
	ei->bei_finders--;
	bdb_cache_entryinfo_unlock( ei );
}

int
bdb_cache_modify(
	struct bdb_info *bdb,
	Entry *e,
	Attribute *newAttrs,
	DB_TXN *txn,
	DB_LOCK *lock )
{
	EntryInfo *ei = BEI(e);
	int rc;
	/* Get write lock on data */
	rc = bdb_cache_entry_db_relock( bdb, txn, ei, 1, 0, lock );

	/* If we've done repeated mods on a cached entry, then e_attrs
	 * is no longer contiguous with the entry, and must be freed.
	 */
	if ( ! rc ) {
		if ( (void *)e->e_attrs != (void *)(e+1) ) {
			attrs_free( e->e_attrs );
		}
		e->e_attrs = newAttrs;
	}
	return rc;
}
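
/* Layout note, inferred from the pointer test above: an entry decoded
 * from the database arrives in one contiguous allocation with its
 * Attribute array starting at (e+1). The first modify replaces
 * e_attrs, so on later modifies the pointer no longer equals (e+1),
 * which is exactly the condition for the old attrs being a separate
 * allocation that must be attrs_free()'d.
 */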

/*
 * Change the rdn in the entryinfo. Also move to a new parent if needed.
 */
int
bdb_cache_modrdn(
	struct bdb_info *bdb,
	Entry *e,
	struct berval *nrdn,
	Entry *new,
	EntryInfo *ein,
	DB_TXN *txn,
	DB_LOCK *lock )
{
	EntryInfo *ei = BEI(e), *pei;
	int rc;
#ifdef BDB_HIER
	struct berval rdn;
#endif

	/* Get write lock on data */
	rc = bdb_cache_entry_db_relock( bdb, txn, ei, 1, 0, lock );
	if ( rc ) return rc;

	/* If we've done repeated mods on a cached entry, then e_attrs
	 * is no longer contiguous with the entry, and must be freed.
	 */
	if ( (void *)e->e_attrs != (void *)(e+1) ) {
		attrs_free( e->e_attrs );
	}
	e->e_attrs = new->e_attrs;
	if( e->e_nname.bv_val < e->e_bv.bv_val ||
		e->e_nname.bv_val > e->e_bv.bv_val + e->e_bv.bv_len )
	{
		ch_free(e->e_name.bv_val);
		ch_free(e->e_nname.bv_val);
	}
	e->e_name = new->e_name;
	e->e_nname = new->e_nname;

	/* Lock the parent's kids AVL tree */
	pei = ei->bei_parent;
	bdb_cache_entryinfo_lock( pei );
	avl_delete( &pei->bei_kids, (caddr_t) ei, bdb_rdn_cmp );
	free( ei->bei_nrdn.bv_val );
	ber_dupbv( &ei->bei_nrdn, nrdn );

#ifdef BDB_HIER
	free( ei->bei_rdn.bv_val );

	rdn = e->e_name;
	if ( nrdn->bv_len != e->e_nname.bv_len ) {
		char *ptr = ber_bvchr(&rdn, ',');
		assert( ptr != NULL );
		rdn.bv_len = ptr - rdn.bv_val;
	}
	ber_dupbv( &ei->bei_rdn, &rdn );

	/* If new parent, decrement kid counts */
	if ( ein ) {
		pei->bei_ckids--;
		if ( pei->bei_dkids ) {
			pei->bei_dkids--;
			if ( pei->bei_dkids < 2 )
				pei->bei_state |= CACHE_ENTRY_NO_KIDS | CACHE_ENTRY_NO_GRANDKIDS;
		}
	}
#endif

	if (!ein) {
		ein = ei->bei_parent;
	} else {
		ei->bei_parent = ein;
		bdb_cache_entryinfo_unlock( pei );
		bdb_cache_entryinfo_lock( ein );

		/* new parent now has kids */
		if ( ein->bei_state & CACHE_ENTRY_NO_KIDS )
			ein->bei_state ^= CACHE_ENTRY_NO_KIDS;
		/* grandparent has grandkids */
		if ( ein->bei_parent )
			ein->bei_parent->bei_state &= ~CACHE_ENTRY_NO_GRANDKIDS;
#ifdef BDB_HIER
		/* parent might now have grandkids */
		if ( ein->bei_state & CACHE_ENTRY_NO_GRANDKIDS &&
			!(ei->bei_state & CACHE_ENTRY_NO_KIDS))
			ein->bei_state ^= CACHE_ENTRY_NO_GRANDKIDS;

		ein->bei_ckids++;
		if ( ein->bei_dkids ) ein->bei_dkids++;
#endif
	}

#ifdef BDB_HIER
	/* Record the generation number of this change */
	ldap_pvt_thread_mutex_lock( &bdb->bi_modrdns_mutex );
	bdb->bi_modrdns++;
	ei->bei_modrdns = bdb->bi_modrdns;
	ldap_pvt_thread_mutex_unlock( &bdb->bi_modrdns_mutex );
#endif

	avl_insert( &ein->bei_kids, ei, bdb_rdn_cmp, avl_dup_error );
	bdb_cache_entryinfo_unlock( ein );
	return rc;
}
/*
 * cache_delete - delete the entry e from the cache.
 *
 * returns:	0	e was deleted ok
 *		1	e was not in the cache
 *		-1	something bad happened
 */
int
bdb_cache_delete(
	struct bdb_info *bdb,
    Entry		*e,
    DB_TXN *txn,
    DB_LOCK	*lock )
{
	EntryInfo *ei = BEI(e);
	int	rc, busy = 0;

	assert( e->e_private != NULL );

	/* Lock the entry's info */
	bdb_cache_entryinfo_lock( ei );

	/* Set this early, warn off any queriers */
	ei->bei_state |= CACHE_ENTRY_DELETED;

	if (( ei->bei_state & ( CACHE_ENTRY_NOT_LINKED |
		CACHE_ENTRY_LOADING | CACHE_ENTRY_ONELEVEL )) ||
		ei->bei_finders > 0 )
		busy = 1;

	bdb_cache_entryinfo_unlock( ei );

	while ( busy ) {
		ldap_pvt_thread_yield();
		busy = 0;
		bdb_cache_entryinfo_lock( ei );
		if (( ei->bei_state & ( CACHE_ENTRY_NOT_LINKED |
			CACHE_ENTRY_LOADING | CACHE_ENTRY_ONELEVEL )) ||
			ei->bei_finders > 0 )
			busy = 1;
		bdb_cache_entryinfo_unlock( ei );
	}

	/* Get write lock on the data */
	rc = bdb_cache_entry_db_relock( bdb, txn, ei, 1, 0, lock );
	if ( rc ) {
		bdb_cache_entryinfo_lock( ei );
		/* couldn't lock, undo and give up */
		ei->bei_state ^= CACHE_ENTRY_DELETED;
		bdb_cache_entryinfo_unlock( ei );
		return rc;
	}

	Debug( LDAP_DEBUG_TRACE, "====> bdb_cache_delete( %ld )\n",
		e->e_id, 0, 0 );

	/* set lru mutex */
	ldap_pvt_thread_mutex_lock( &bdb->bi_cache.c_lru_mutex );

	bdb_cache_entryinfo_lock( ei->bei_parent );
	bdb_cache_entryinfo_lock( ei );
	rc = bdb_cache_delete_internal( &bdb->bi_cache, e->e_private, 1 );
	bdb_cache_entryinfo_unlock( ei );

	/* free lru mutex */
	ldap_pvt_thread_mutex_unlock( &bdb->bi_cache.c_lru_mutex );

	return( rc );
}
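
/* Illustrative only: the expected call sequence around the function
 * above. bdb_cache_delete() unhooks the info from the DN and ID
 * trees; the caller then locks it and lets bdb_cache_delete_cleanup()
 * release the Entry and the info itself (cleanup unlocks before it
 * frees).
 */
#if 0
{
	int rc = bdb_cache_delete( bdb, e, txn, &lock );
	if ( rc == 0 ) {
		bdb_cache_entryinfo_lock( BEI(e) );
		bdb_cache_delete_cleanup( &bdb->bi_cache, BEI(e) );
	}
}
#endif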

void
bdb_cache_delete_cleanup(
	Cache *cache,
	EntryInfo *ei )
{
	/* Enter with ei locked */

	/* already freed? */
	if ( !ei->bei_parent ) return;

	if ( ei->bei_e ) {
		ei->bei_e->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
		bdb_entry_return( ei->bei_bdb, ei->bei_e, ei->bei_zseq );
#else
		bdb_entry_return( ei->bei_e );
#endif
		ei->bei_e = NULL;
	}

	bdb_cache_entryinfo_unlock( ei );
	bdb_cache_entryinfo_free( cache, ei );
}

static int
bdb_cache_delete_internal(
    Cache	*cache,
    EntryInfo		*e,
    int		decr )
{
	int rc = 0;	/* return code */
	int decr_leaf = 0;

	/* already freed? */
	if ( !e->bei_parent ) {
		assert(0);
		return -1;
	}

#ifdef BDB_HIER
	e->bei_parent->bei_ckids--;
	if ( decr && e->bei_parent->bei_dkids ) e->bei_parent->bei_dkids--;
#endif
	/* dn tree */
	if ( avl_delete( &e->bei_parent->bei_kids, (caddr_t) e, bdb_rdn_cmp )
		== NULL )
	{
		rc = -1;
		assert(0);
	}
	if ( e->bei_parent->bei_kids )
		decr_leaf = 1;

	ldap_pvt_thread_rdwr_wlock( &cache->c_rwlock );
	/* id tree */
	if ( avl_delete( &cache->c_idtree, (caddr_t) e, bdb_id_cmp )) {
		cache->c_eiused--;
		if ( decr_leaf )
			cache->c_leaves--;
	} else {
		rc = -1;
		assert(0);
	}
	ldap_pvt_thread_rdwr_wunlock( &cache->c_rwlock );
	bdb_cache_entryinfo_unlock( e->bei_parent );

	if ( rc == 0 ){
		/* lru */
		LRU_DEL( cache, e );

		if ( e->bei_e ) {
			ldap_pvt_thread_mutex_lock( &cache->c_count_mutex );
			cache->c_cursize--;
			ldap_pvt_thread_mutex_unlock( &cache->c_count_mutex );
		}
	}

	return( rc );
}

static void
bdb_entryinfo_release( void *data )
{
	EntryInfo *ei = (EntryInfo *)data;
	if ( ei->bei_kids ) {
		avl_free( ei->bei_kids, NULL );
	}
	if ( ei->bei_e ) {
		ei->bei_e->e_private = NULL;
#ifdef SLAP_ZONE_ALLOC
		bdb_entry_return( ei->bei_bdb, ei->bei_e, ei->bei_zseq );
#else
		bdb_entry_return( ei->bei_e );
#endif
	}
	bdb_cache_entryinfo_destroy( ei );
}

void
bdb_cache_release_all( Cache *cache )
{
	/* set cache write lock */
	ldap_pvt_thread_rdwr_wlock( &cache->c_rwlock );
	/* set lru mutex */
	ldap_pvt_thread_mutex_lock( &cache->c_lru_mutex );

	Debug( LDAP_DEBUG_TRACE, "====> bdb_cache_release_all\n", 0, 0, 0 );

	avl_free( cache->c_dntree.bei_kids, NULL );
	avl_free( cache->c_idtree, bdb_entryinfo_release );
	for (;cache->c_eifree;cache->c_eifree = cache->c_lruhead) {
		cache->c_lruhead = cache->c_eifree->bei_lrunext;
		bdb_cache_entryinfo_destroy(cache->c_eifree);
	}
	cache->c_cursize = 0;
	cache->c_eiused = 0;
	cache->c_leaves = 0;
	cache->c_idtree = NULL;
	cache->c_lruhead = NULL;
	cache->c_lrutail = NULL;
	cache->c_dntree.bei_kids = NULL;

	/* free lru mutex */
	ldap_pvt_thread_mutex_unlock( &cache->c_lru_mutex );
	/* free cache write lock */
	ldap_pvt_thread_rdwr_wunlock( &cache->c_rwlock );
}

#ifdef LDAP_DEBUG
static void
bdb_lru_count( Cache *cache )
{
	EntryInfo	*e;
	int ei = 0, ent = 0, nc = 0;

	for ( e = cache->c_lrutail; ; ) {
		ei++;
		if ( e->bei_e ) {
			ent++;
			if ( e->bei_state & CACHE_ENTRY_NOT_CACHED )
				nc++;
			fprintf( stderr, "ei %d entry %p dn %s\n", ei, (void *) e->bei_e, e->bei_e->e_name.bv_val );
		}
		e = e->bei_lrunext;
		if ( e == cache->c_lrutail )
			break;
	}
	fprintf( stderr, "counted %d entryInfos and %d entries, %d notcached\n",
		ei, ent, nc );
	ei = 0;
	for ( e = cache->c_lrutail; ; ) {
		ei++;
		e = e->bei_lruprev;
		if ( e == cache->c_lrutail )
			break;
	}
	fprintf( stderr, "counted %d entryInfos (on lruprev)\n", ei );
}

#ifdef SLAPD_UNUSED
static void
bdb_lru_print( Cache *cache )
{
	EntryInfo	*e;

	fprintf( stderr, "LRU circle head: %p\n", (void *) cache->c_lruhead );
	fprintf( stderr, "LRU circle (tail forward):\n" );
	for ( e = cache->c_lrutail; ; ) {
		fprintf( stderr, "\t%p, %p id %ld rdn \"%s\"\n",
			(void *) e, (void *) e->bei_e, e->bei_id, e->bei_nrdn.bv_val );
		e = e->bei_lrunext;
		if ( e == cache->c_lrutail )
			break;
	}
	fprintf( stderr, "LRU circle (tail backward):\n" );
	for ( e = cache->c_lrutail; ; ) {
		fprintf( stderr, "\t%p, %p id %ld rdn \"%s\"\n",
			(void *) e, (void *) e->bei_e, e->bei_id, e->bei_nrdn.bv_val );
		e = e->bei_lruprev;
		if ( e == cache->c_lrutail )
			break;
	}
}

static int
bdb_entryinfo_print(void *data, void *arg)
{
	EntryInfo *e = data;
	fprintf( stderr, "\t%p, %p id %ld rdn \"%s\"\n",
		(void *) e, (void *) e->bei_e, e->bei_id, e->bei_nrdn.bv_val );
	return 0;
}

static void
bdb_idtree_print(Cache *cache)
{
	avl_apply( cache->c_idtree, bdb_entryinfo_print, NULL, -1, AVL_INORDER );
}
#endif
#endif

static void
bdb_reader_free( void *key, void *data )
{
	/* DB_ENV *env = key; */
	DB_TXN *txn = data;

	if ( txn ) TXN_ABORT( txn );
}

/* free up any keys used by the main thread */
void
bdb_reader_flush( DB_ENV *env )
{
	void *data;
	void *ctx = ldap_pvt_thread_pool_context();

	if ( !ldap_pvt_thread_pool_getkey( ctx, env, &data, NULL ) ) {
		ldap_pvt_thread_pool_setkey( ctx, env, NULL, 0, NULL, NULL );
		bdb_reader_free( env, data );
	}
}

int
bdb_reader_get( Operation *op, DB_ENV *env, DB_TXN **txn )
{
	int i, rc;
	void *data;
	void *ctx;

	if ( !env || !txn ) return -1;

	/* If no op was provided, try to find the ctx anyway... */
	if ( op ) {
		ctx = op->o_threadctx;
	} else {
		ctx = ldap_pvt_thread_pool_context();
	}

	/* Shouldn't happen unless we're single-threaded */
	if ( !ctx ) {
		*txn = NULL;
		return 0;
	}

	if ( ldap_pvt_thread_pool_getkey( ctx, env, &data, NULL ) ) {
		for ( i=0, rc=1; rc != 0 && i<4; i++ ) {
			rc = TXN_BEGIN( env, NULL, txn, DB_READ_COMMITTED );
			if (rc) ldap_pvt_thread_yield();
		}
		if ( rc != 0 ) {
			return rc;
		}
		data = *txn;
		if ( ( rc = ldap_pvt_thread_pool_setkey( ctx, env,
			data, bdb_reader_free, NULL, NULL ) ) ) {
			TXN_ABORT( *txn );
			Debug( LDAP_DEBUG_ANY, "bdb_reader_get: err %s(%d)\n",
				db_strerror(rc), rc, 0 );

			return rc;
		}
	} else {
		*txn = data;
	}
	return 0;
}

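/* Illustrative only: how a read operation typically obtains its
 * cached reader transaction. The transaction is created once per
 * thread, stashed as a thread-pool key, and finally aborted by
 * bdb_reader_free() when the thread context is destroyed, so the
 * caller never commits or aborts it. Variable names are hypothetical.
 */
#if 0
{
	DB_TXN *rtxn = NULL;

	if ( bdb_reader_get( op, bdb->bi_dbenv, &rtxn ) == 0 ) {
		/* ... pass rtxn to bdb_dn2id / bdb_id2entry lookups ... */
	}
}
#endif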