1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993, 1994, 1995
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Mike Olson.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id: bt_search.c,v 12.40 2008/05/07 12:27:31 bschmeck Exp $
42 */
43
44#include "db_config.h"
45
46#include "db_int.h"
47#include "dbinc/db_page.h"
48#include "dbinc/btree.h"
49#include "dbinc/lock.h"
50#include "dbinc/mp.h"
51
52/*
53 * __bam_get_root --
54 *	Fetch the root of a tree and see if we want to keep
55 * it in the stack.
56 *
57 * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
58 */
59int
60__bam_get_root(dbc, pg, slevel, flags, stack)
61	DBC *dbc;
62	db_pgno_t pg;
63	int slevel;
64	u_int32_t flags;
65	int *stack;
66{
67	BTREE_CURSOR *cp;
68	DB *dbp;
69	DB_LOCK lock;
70	DB_MPOOLFILE *mpf;
71	PAGE *h;
72	db_lockmode_t lock_mode;
73	int ret, t_ret;
74
75	dbp = dbc->dbp;
76	mpf = dbp->mpf;
77	cp = (BTREE_CURSOR *)dbc->internal;
78	/*
79	 * If write-locking pages, we need to know whether or not to acquire a
80	 * write lock on a page before getting it.  This depends on how deep it
81	 * is in tree, which we don't know until we acquire the root page.  So,
82	 * if we need to lock the root page we may have to upgrade it later,
83	 * because we won't get the correct lock initially.
84	 *
85	 * Retrieve the root page.
86	 */
87try_again:
88	*stack = LF_ISSET(SR_STACK) &&
89	      (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM));
90	lock_mode = DB_LOCK_READ;
91	if (*stack ||
92	    LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE)))
93		lock_mode = DB_LOCK_WRITE;
94	if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
95		return (ret);
96	if ((ret = __memp_fget(mpf, &pg,
97	     dbc->thread_info, dbc->txn, 0, &h)) != 0) {
98		/* Did not read it, so we can release the lock */
99		(void)__LPUT(dbc, lock);
100		return (ret);
101	}
102
103	/*
104	 * Decide if we need to save this page; if we do, write lock it.
105	 * We deliberately don't lock-couple on this call.  If the tree
106	 * is tiny, i.e., one page, and two threads are busily updating
107	 * the root page, we're almost guaranteed deadlocks galore, as
108	 * each one gets a read lock and then blocks the other's attempt
109	 * for a write lock.
110	 */
111	if (!*stack &&
112	    ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
113	    (LF_ISSET(SR_WRITE) && LEVEL(h) == LEAFLEVEL) ||
114	    (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
115		if (!STD_LOCKING(dbc))
116			goto no_relock;
117		ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
118		if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
119			ret = t_ret;
120		if (ret != 0)
121			return (ret);
122		lock_mode = DB_LOCK_WRITE;
123		if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
124			return (ret);
125		if ((ret = __memp_fget(mpf, &pg,
126		     dbc->thread_info, dbc->txn, 0, &h)) != 0) {
127			/* Did not read it, so we can release the lock */
128			(void)__LPUT(dbc, lock);
129			return (ret);
130		}
131		if (!((LF_ISSET(SR_PARENT) &&
132		    (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
133		    (LF_ISSET(SR_WRITE) && LEVEL(h) == LEAFLEVEL) ||
134		    (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
135			/* Someone else split the root, start over. */
136			ret = __memp_fput(mpf,
137			    dbc->thread_info, h, dbc->priority);
138			if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
139				ret = t_ret;
140			if (ret != 0)
141				return (ret);
142			goto try_again;
143		}
144no_relock:	*stack = 1;
145	}
146	BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret);
147
148	return (ret);
149}
150
151/*
152 * __bam_search --
153 *	Search a btree for a key.
154 *
155 * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
156 * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
157 */
158int
159__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
160	DBC *dbc;
161	db_pgno_t root_pgno;
162	const DBT *key;
163	u_int32_t flags;
164	int slevel, *exactp;
165	db_recno_t *recnop;
166{
167	BTREE *t;
168	BTREE_CURSOR *cp;
169	DB *dbp;
170	DB_LOCK lock;
171	DB_MPOOLFILE *mpf;
172	ENV *env;
173	PAGE *h;
174	db_indx_t base, i, indx, *inp, lim;
175	db_lockmode_t lock_mode;
176	db_pgno_t pg;
177	db_recno_t recno;
178	int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
179	int (*func) __P((DB *, const DBT *, const DBT *));
180
181	dbp = dbc->dbp;
182	env = dbp->env;
183	mpf = dbp->mpf;
184	cp = (BTREE_CURSOR *)dbc->internal;
185	h = NULL;
186	t = dbp->bt_internal;
187	recno = 0;
188	set_stack = 0;
189
190	BT_STK_CLR(cp);
191
192	/*
193	 * There are several ways we search a btree tree.  The flags argument
194	 * specifies if we're acquiring read or write locks, if we position
195	 * to the first or last item in a set of duplicates, if we return
196	 * deleted items, and if we are locking pairs of pages.  In addition,
197	 * if we're modifying record numbers, we have to lock the entire tree
198	 * regardless.  See btree.h for more details.
199	 */
200
201	if (root_pgno == PGNO_INVALID)
202		root_pgno = cp->root;
203	if ((ret = __bam_get_root(dbc, root_pgno, slevel, flags, &stack)) != 0)
204		return (ret);
205	lock_mode = cp->csp->lock_mode;
206	lock = cp->csp->lock;
207	h = cp->csp->page;
208
209	BT_STK_CLR(cp);
210
211	/* Choose a comparison function. */
212	func = F_ISSET(dbc, DBC_OPD) ?
213	    (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
214	    t->bt_compare;
215
216	for (;;) {
217		inp = P_INP(dbp, h);
218		adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
219		if (LF_ISSET(SR_MIN | SR_MAX)) {
220			if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0)
221				indx = 0;
222			else if (TYPE(h) == P_LBTREE)
223				indx = NUM_ENT(h) - 2;
224			else
225				indx = NUM_ENT(h) - 1;
226
227			if (LEVEL(h) == LEAFLEVEL ||
228			     (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
229				if (LF_ISSET(SR_NEXT))
230					goto get_next;
231				goto found;
232			}
233			goto next;
234		}
235		/*
236		 * Do a binary search on the current page.  If we're searching
237		 * a Btree leaf page, we have to walk the indices in groups of
238		 * two.  If we're searching an internal page or a off-page dup
239		 * page, they're an index per page item.  If we find an exact
240		 * match on a leaf page, we're done.
241		 */
242		DB_BINARY_SEARCH_FOR(base, lim, h, adjust) {
243			DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
244			if ((ret = __bam_cmp(dbp, dbc->thread_info,
245			     dbc->txn, key, h, indx, func, &cmp)) != 0)
246				goto err;
247			if (cmp == 0) {
248				if (LEVEL(h) == LEAFLEVEL ||
249				    (!LF_ISSET(SR_START) &&
250				    LEVEL(h) == slevel)) {
251					if (LF_ISSET(SR_NEXT))
252						goto get_next;
253					goto found;
254				}
255				goto next;
256			}
257			if (cmp > 0)
258				DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
259				    lim, adjust);
260		}
261
262		/*
263		 * No match found.  Base is the smallest index greater than
264		 * key and may be zero or a last + O_INDX index.
265		 *
266		 * If it's a leaf page or the stopping point,
267		 * return base as the "found" value.
268		 * Delete only deletes exact matches.
269		 */
270		if (LEVEL(h) == LEAFLEVEL ||
271		    (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
272			*exactp = 0;
273
274			if (LF_ISSET(SR_EXACT)) {
275				ret = DB_NOTFOUND;
276				goto err;
277			}
278
279			if (LF_ISSET(SR_STK_ONLY)) {
280				BT_STK_NUM(env, cp, h, base, ret);
281				if ((t_ret =
282				    __LPUT(dbc, lock)) != 0 && ret == 0)
283					ret = t_ret;
284				if ((t_ret = __memp_fput(mpf, dbc->thread_info,
285				     h, dbc->priority)) != 0 && ret == 0)
286					ret = t_ret;
287				return (ret);
288			}
289			if (LF_ISSET(SR_NEXT)) {
290get_next:			/*
291				 * The caller could have asked for a NEXT
292				 * at the root if the tree recently collapsed.
293				 */
294				if (PGNO(h) == root_pgno) {
295					ret = DB_NOTFOUND;
296					goto err;
297				}
298				/*
299				 * Save the root of the subtree
300				 * and drop the rest of the subtree
301				 * and search down again starting at
302				 * the next child.
303				 */
304				if ((ret = __LPUT(dbc, lock)) != 0)
305					goto err;
306				if ((ret = __memp_fput(mpf,
307				    dbc->thread_info, h, dbc->priority)) != 0)
308					goto err;
309				h = NULL;
310				LF_SET(SR_MIN);
311				LF_CLR(SR_NEXT);
312				indx = cp->sp->indx + 1;
313				if (indx == NUM_ENT(cp->sp->page)) {
314					ret = DB_NOTFOUND;
315					cp->csp++;
316					goto err;
317				}
318				h = cp->sp->page;
319				cp->sp->page = NULL;
320				lock = cp->sp->lock;
321				LOCK_INIT(cp->sp->lock);
322				if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
323					goto err;
324				stack = 1;
325				goto next;
326			}
327
328			/*
329			 * !!!
330			 * Possibly returning a deleted record -- DB_SET_RANGE,
331			 * DB_KEYFIRST and DB_KEYLAST don't require an exact
332			 * match, and we don't want to walk multiple pages here
333			 * to find an undeleted record.  This is handled by the
334			 * calling routine.
335			 */
336			if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
337				cp->csp++;
338			BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret);
339			if (ret != 0)
340				goto err;
341			return (0);
342		}
343
344		/*
345		 * If it's not a leaf page, record the internal page (which is
346		 * a parent page for the key).  Decrement the base by 1 if it's
347		 * non-zero so that if a split later occurs, the inserted page
348		 * will be to the right of the saved page.
349		 */
350		indx = base > 0 ? base - O_INDX : base;
351
352		/*
353		 * If we're trying to calculate the record number, sum up
354		 * all the record numbers on this page up to the indx point.
355		 */
356next:		if (recnop != NULL)
357			for (i = 0; i < indx; ++i)
358				recno += GET_BINTERNAL(dbp, h, i)->nrecs;
359
360		pg = GET_BINTERNAL(dbp, h, indx)->pgno;
361
362		/* See if we are at the level to start stacking. */
363		if (LF_ISSET(SR_START) && slevel == LEVEL(h))
364			stack = 1;
365
366		if (LF_ISSET(SR_STK_ONLY)) {
367			if (slevel == LEVEL(h)) {
368				BT_STK_NUM(env, cp, h, indx, ret);
369				if ((t_ret =
370				    __LPUT(dbc, lock)) != 0 && ret == 0)
371					ret = t_ret;
372				if ((t_ret = __memp_fput(mpf, dbc->thread_info,
373				    h, dbc->priority)) != 0 && ret == 0)
374					ret = t_ret;
375				return (ret);
376			}
377			BT_STK_NUMPUSH(env, cp, h, indx, ret);
378			(void)__memp_fput(mpf,
379			    dbc->thread_info, h, dbc->priority);
380			h = NULL;
381			if ((ret = __db_lget(dbc,
382			    LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
383				/*
384				 * Discard our lock and return on failure.  This
385				 * is OK because it only happens when descending
386				 * the tree holding read-locks.
387				 */
388				(void)__LPUT(dbc, lock);
389				return (ret);
390			}
391		} else if (stack) {
392			/* Return if this is the lowest page wanted. */
393			if (LF_ISSET(SR_PARENT) && slevel == LEVEL(h)) {
394				BT_STK_ENTER(env,
395				    cp, h, indx, lock, lock_mode, ret);
396				if (ret != 0)
397					goto err;
398				return (0);
399			}
400			if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) {
401				/*
402				 * There was a page with a singleton pointer
403				 * to a non-empty subtree.
404				 */
405				cp->csp--;
406				if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
407					goto err;
408				set_stack = stack = 0;
409				goto do_del;
410			}
411			BT_STK_PUSH(env,
412			    cp, h, indx, lock, lock_mode, ret);
413			if (ret != 0)
414				goto err;
415			h = NULL;
416
417			lock_mode = DB_LOCK_WRITE;
418			if ((ret =
419			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
420				goto err;
421		} else {
422			/*
423			 * Decide if we want to return a reference to the next
424			 * page in the return stack.  If so, lock it and never
425			 * unlock it.  We will want to stack things on the
426			 * next iteration.  The stack variable cannot be
427			 * set until we leave this clause.
428			 */
429			if ((LF_ISSET(SR_PARENT) &&
430			    (u_int8_t)(slevel + 1) >= (LEVEL(h) - 1)) ||
431			    (LEVEL(h) - 1) == LEAFLEVEL)
432				set_stack = 1;
433
434			/*
435			 * Returning a subtree.  See if we have hit the start
436			 * point if so save the parent and set stack.
437			 * Otherwise free the parent and temporarily
438			 * save this one.
439			 * For SR_DEL we need to find a page with 1 entry.
440			 * For SR_NEXT we want find the minimal subtree
441			 * that contains the key and the next page.
442			 * We save pages as long as we are at the right
443			 * edge of the subtree.  When we leave the right
444			 * edge, then drop the subtree.
445			 */
446			if (!LF_ISSET(SR_DEL | SR_NEXT)) {
447				if ((ret = __memp_fput(mpf,
448				    dbc->thread_info, h, dbc->priority)) != 0)
449					goto err;
450				goto lock_next;
451			}
452
453			if ((LF_ISSET(SR_DEL) && NUM_ENT(h) == 1)) {
454				/*
455				 * We are pushing the things on the stack,
456				 * set the stack variable now to indicate this
457				 * has happened.
458				 */
459				stack = set_stack = 1;
460				LF_SET(SR_WRITE);
461				/* Push the parent. */
462				cp->csp++;
463				/* Push this node. */
464				BT_STK_PUSH(env, cp, h,
465				     indx, lock, lock_mode, ret);
466				if (ret != 0)
467					goto err;
468				LOCK_INIT(lock);
469			} else {
470			/*
471			 * See if we want to save the tree so far.
472			 * If we are looking for the next key,
473			 * then we must save this node if we are
474			 * at the end of the page.  If not then
475			 * discard anything we have saved so far.
476			 * For delete only keep one node until
477			 * we find a singleton.
478			 */
479do_del:				if (cp->csp->page != NULL) {
480					if (LF_ISSET(SR_NEXT) &&
481					     indx == NUM_ENT(h) - 1)
482						cp->csp++;
483					else if ((ret =
484					    __bam_stkrel(dbc, STK_NOLOCK)) != 0)
485						goto err;
486				}
487				/* Save this node. */
488				BT_STK_ENTER(env, cp,
489				    h, indx, lock, lock_mode, ret);
490				if (ret != 0)
491					goto err;
492				LOCK_INIT(lock);
493			}
494
495lock_next:		h = NULL;
496
497			if (set_stack && LF_ISSET(SR_WRITE))
498				lock_mode = DB_LOCK_WRITE;
499			if ((ret = __db_lget(dbc,
500			    LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
501				/*
502				 * If we fail, discard the lock we held.  This
503				 * is OK because this only happens when we are
504				 * descending the tree holding read-locks.
505				 */
506				(void)__LPUT(dbc, lock);
507				if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
508					cp->csp++;
509				goto err;
510			}
511			stack = set_stack;
512		}
513		if ((ret = __memp_fget(mpf, &pg,
514		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
515			goto err;
516	}
517	/* NOTREACHED */
518
519found:	*exactp = 1;
520
521	/*
522	 * If we got here, we know that we have a Btree leaf or off-page
523	 * duplicates page.  If it's a Btree leaf page, we have to handle
524	 * on-page duplicates.
525	 *
526	 * If there are duplicates, go to the first/last one.  This is
527	 * safe because we know that we're not going to leave the page,
528	 * all duplicate sets that are not on overflow pages exist on a
529	 * single leaf page.
530	 */
531	if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) {
532		if (LF_ISSET(SR_DUPLAST))
533			while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
534			    inp[indx] == inp[indx + P_INDX])
535				indx += P_INDX;
536		else if (LF_ISSET(SR_DUPFIRST))
537			while (indx > 0 &&
538			    inp[indx] == inp[indx - P_INDX])
539				indx -= P_INDX;
540	}
541
542	/*
543	 * Now check if we are allowed to return deleted items; if not, then
544	 * find the next (or previous) non-deleted duplicate entry.  (We do
545	 * not move from the original found key on the basis of the SR_DELNO
546	 * flag.)
547	 */
548	DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO));
549	if (LF_ISSET(SR_DELNO)) {
550		deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
551		if (LF_ISSET(SR_DUPLAST))
552			while (B_DISSET(GET_BKEYDATA(dbp,
553			    h, indx + deloffset)->type) && indx > 0 &&
554			    inp[indx] == inp[indx - adjust])
555				indx -= adjust;
556		else
557			while (B_DISSET(GET_BKEYDATA(dbp,
558			    h, indx + deloffset)->type) &&
559			    indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
560			    inp[indx] == inp[indx + adjust])
561				indx += adjust;
562
563		/*
564		 * If we weren't able to find a non-deleted duplicate, return
565		 * DB_NOTFOUND.
566		 */
567		if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
568			ret = DB_NOTFOUND;
569			goto err;
570		}
571
572		/*
573		 * Increment the record counter to point to the found element.
574		 * Ignore any deleted key/data pairs.  There doesn't need to
575		 * be any correction for duplicates, as Btree doesn't support
576		 * duplicates and record numbers in the same tree.
577		 */
578		if (recnop != NULL) {
579			DB_ASSERT(env, TYPE(h) == P_LBTREE);
580
581			for (i = 0; i < indx; i += P_INDX)
582				if (!B_DISSET(
583				    GET_BKEYDATA(dbp, h, i + O_INDX)->type))
584					++recno;
585
586			/* Correct the number for a 0-base. */
587			*recnop = recno + 1;
588		}
589	}
590
591	if (LF_ISSET(SR_STK_ONLY)) {
592		BT_STK_NUM(env, cp, h, indx, ret);
593		if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
594			ret = t_ret;
595		if ((t_ret = __memp_fput(mpf,
596		     dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
597			ret = t_ret;
598	} else {
599		if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
600			cp->csp++;
601		BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
602	}
603	if (ret != 0)
604		goto err;
605
606	return (0);
607
608err:	if (h != NULL && (t_ret = __memp_fput(mpf,
609	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
610		ret = t_ret;
611
612	/* Keep any not-found page locked for serializability. */
613	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
614		ret = t_ret;
615
616	BT_STK_POP(cp);
617	__bam_stkrel(dbc, 0);
618
619	return (ret);
620}
621
622/*
623 * __bam_stkrel --
624 *	Release all pages currently held in the stack.
625 *
626 * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
627 */
628int
629__bam_stkrel(dbc, flags)
630	DBC *dbc;
631	u_int32_t flags;
632{
633	BTREE_CURSOR *cp;
634	DB *dbp;
635	DB_MPOOLFILE *mpf;
636	EPG *epg;
637	int ret, t_ret;
638
639	dbp = dbc->dbp;
640	mpf = dbp->mpf;
641	cp = (BTREE_CURSOR *)dbc->internal;
642
643	/*
644	 * Release inner pages first.
645	 *
646	 * The caller must be sure that setting STK_NOLOCK will not effect
647	 * either serializability or recoverability.
648	 */
649	for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
650		if (epg->page != NULL) {
651			if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
652				cp->page = NULL;
653				LOCK_INIT(cp->lock);
654			}
655			if ((t_ret = __memp_fput(mpf, dbc->thread_info,
656			     epg->page, dbc->priority)) != 0 && ret == 0)
657				ret = t_ret;
658			/*
659			 * XXX
660			 * Temporary fix for #3243 -- under certain deadlock
661			 * conditions we call here again and re-free the page.
662			 * The correct fix is to never release a stack that
663			 * doesn't hold items.
664			 */
665			epg->page = NULL;
666		}
667		/*
668		 * We set this if we need to release our pins,
669		 * but are not logically ready to have the pages
670		 * visible.
671		 */
672		if (LF_ISSET(STK_PGONLY))
673			continue;
674		if (LF_ISSET(STK_NOLOCK)) {
675			if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0)
676				ret = t_ret;
677		} else
678			if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
679				ret = t_ret;
680	}
681
682	/* Clear the stack, all pages have been released. */
683	if (!LF_ISSET(STK_PGONLY))
684		BT_STK_CLR(cp);
685
686	return (ret);
687}
688
689/*
690 * __bam_stkgrow --
691 *	Grow the stack.
692 *
693 * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
694 */
695int
696__bam_stkgrow(env, cp)
697	ENV *env;
698	BTREE_CURSOR *cp;
699{
700	EPG *p;
701	size_t entries;
702	int ret;
703
704	entries = cp->esp - cp->sp;
705
706	if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0)
707		return (ret);
708	memcpy(p, cp->sp, entries * sizeof(EPG));
709	if (cp->sp != cp->stack)
710		__os_free(env, cp->sp);
711	cp->sp = p;
712	cp->csp = p + entries;
713	cp->esp = p + entries * 2;
714	return (0);
715}
716