1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * $Id: bt_rsearch.c,v 12.17 2008/01/08 20:57:59 bostic Exp $
39 */
40
41#include "db_config.h"
42
43#include "db_int.h"
44#include "dbinc/db_page.h"
45#include "dbinc/btree.h"
46#include "dbinc/lock.h"
47#include "dbinc/mp.h"
48
49/*
50 * __bam_rsearch --
51 *	Search a btree for a record number.
52 *
53 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
54 */
55int
56__bam_rsearch(dbc, recnop, flags, stop, exactp)
57	DBC *dbc;
58	db_recno_t *recnop;
59	u_int32_t flags;
60	int stop, *exactp;
61{
62	BINTERNAL *bi;
63	BTREE_CURSOR *cp;
64	DB *dbp;
65	DB_LOCK lock;
66	DB_MPOOLFILE *mpf;
67	PAGE *h;
68	RINTERNAL *ri;
69	db_indx_t adjust, deloffset, indx, top;
70	db_lockmode_t lock_mode;
71	db_pgno_t pg;
72	db_recno_t recno, t_recno, total;
73	int ret, stack, t_ret;
74
75	dbp = dbc->dbp;
76	mpf = dbp->mpf;
77	cp = (BTREE_CURSOR *)dbc->internal;
78	h = NULL;
79
80	BT_STK_CLR(cp);
81
82	/*
83	 * There are several ways we search a btree tree.  The flags argument
84	 * specifies if we're acquiring read or write locks and if we are
85	 * locking pairs of pages.  In addition, if we're adding or deleting
86	 * an item, we have to lock the entire tree, regardless.  See btree.h
87	 * for more details.
88	 *
89	 * If write-locking pages, we need to know whether or not to acquire a
90	 * write lock on a page before getting it.  This depends on how deep it
91	 * is in tree, which we don't know until we acquire the root page.  So,
92	 * if we need to lock the root page we may have to upgrade it later,
93	 * because we won't get the correct lock initially.
94	 *
95	 * Retrieve the root page.
96	 */
97
98	if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0)
99		return (ret);
100	lock_mode = cp->csp->lock_mode;
101	lock = cp->csp->lock;
102	h = cp->csp->page;
103
104	BT_STK_CLR(cp);
105	/*
106	 * If appending to the tree, set the record number now -- we have the
107	 * root page locked.
108	 *
109	 * Delete only deletes exact matches, read only returns exact matches.
110	 * Note, this is different from __bam_search(), which returns non-exact
111	 * matches for read.
112	 *
113	 * The record may not exist.  We can only return the correct location
114	 * for the record immediately after the last record in the tree, so do
115	 * a fast check now.
116	 */
117	total = RE_NREC(h);
118	if (LF_ISSET(SR_APPEND)) {
119		*exactp = 0;
120		*recnop = recno = total + 1;
121	} else {
122		recno = *recnop;
123		if (recno <= total)
124			*exactp = 1;
125		else {
126			*exactp = 0;
127			if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
128				/*
129				 * Keep the page locked for serializability.
130				 *
131				 * XXX
132				 * This leaves the root page locked, which will
133				 * eliminate any concurrency.  A possible fix
134				 * would be to lock the last leaf page instead.
135				 */
136				ret = __memp_fput(mpf,
137				    dbc->thread_info, h, dbc->priority);
138				if ((t_ret =
139				    __TLPUT(dbc, lock)) != 0 && ret == 0)
140					ret = t_ret;
141				return (ret == 0 ? DB_NOTFOUND : ret);
142			}
143		}
144	}
145
146	/*
147	 * !!!
148	 * Record numbers in the tree are 0-based, but the recno is
149	 * 1-based.  All of the calculations below have to take this
150	 * into account.
151	 */
152	for (total = 0;;) {
153		switch (TYPE(h)) {
154		case P_LBTREE:
155		case P_LDUP:
156			recno -= total;
157			/*
158			 * There may be logically deleted records on the page.
159			 * If there are enough, the record may not exist.
160			 */
161			if (TYPE(h) == P_LBTREE) {
162				adjust = P_INDX;
163				deloffset = O_INDX;
164			} else {
165				adjust = O_INDX;
166				deloffset = 0;
167			}
168			for (t_recno = 0, indx = 0;; indx += adjust) {
169				if (indx >= NUM_ENT(h)) {
170					*exactp = 0;
171					if (!LF_ISSET(SR_PAST_EOF) ||
172					    recno > t_recno + 1) {
173						ret = __memp_fput(mpf,
174						    dbc->thread_info,
175						    h, dbc->priority);
176						h = NULL;
177						if ((t_ret = __TLPUT(dbc,
178						    lock)) != 0 && ret == 0)
179							ret = t_ret;
180						if (ret == 0)
181							ret = DB_NOTFOUND;
182						goto err;
183					}
184				}
185				if (!B_DISSET(GET_BKEYDATA(dbp, h,
186				    indx + deloffset)->type) &&
187				    ++t_recno == recno)
188					break;
189			}
190
191			/* Correct from 1-based to 0-based for a page offset. */
192			BT_STK_ENTER(dbp->env,
193			    cp, h, indx, lock, lock_mode, ret);
194			if (ret != 0)
195				goto err;
196			return (0);
197		case P_IBTREE:
198			for (indx = 0, top = NUM_ENT(h);;) {
199				bi = GET_BINTERNAL(dbp, h, indx);
200				if (++indx == top || total + bi->nrecs >= recno)
201					break;
202				total += bi->nrecs;
203			}
204			pg = bi->pgno;
205			break;
206		case P_LRECNO:
207			recno -= total;
208
209			/* Correct from 1-based to 0-based for a page offset. */
210			--recno;
211			BT_STK_ENTER(dbp->env,
212			    cp, h, recno, lock, lock_mode, ret);
213			if (ret != 0)
214				goto err;
215			return (0);
216		case P_IRECNO:
217			for (indx = 0, top = NUM_ENT(h);;) {
218				ri = GET_RINTERNAL(dbp, h, indx);
219				if (++indx == top || total + ri->nrecs >= recno)
220					break;
221				total += ri->nrecs;
222			}
223			pg = ri->pgno;
224			break;
225		default:
226			return (__db_pgfmt(dbp->env, h->pgno));
227		}
228		--indx;
229
230		/* Return if this is the lowest page wanted. */
231		if (stop == LEVEL(h)) {
232			BT_STK_ENTER(dbp->env,
233			    cp, h, indx, lock, lock_mode, ret);
234			if (ret != 0)
235				goto err;
236			return (0);
237		}
238		if (stack) {
239			BT_STK_PUSH(dbp->env,
240			    cp, h, indx, lock, lock_mode, ret);
241			if (ret != 0)
242				goto err;
243			h = NULL;
244
245			lock_mode = DB_LOCK_WRITE;
246			if ((ret =
247			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
248				goto err;
249		} else {
250			/*
251			 * Decide if we want to return a pointer to the next
252			 * page in the stack.  If we do, write lock it and
253			 * never unlock it.
254			 */
255			if ((LF_ISSET(SR_PARENT) &&
256			    (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
257			    (LEVEL(h) - 1) == LEAFLEVEL)
258				stack = 1;
259
260			if ((ret = __memp_fput(mpf,
261			    dbc->thread_info, h, dbc->priority)) != 0)
262				goto err;
263			h = NULL;
264
265			lock_mode = stack &&
266			    LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
267			if ((ret = __db_lget(dbc,
268			    LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
269				/*
270				 * If we fail, discard the lock we held.  This
271				 * is OK because this only happens when we are
272				 * descending the tree holding read-locks.
273				 */
274				(void)__LPUT(dbc, lock);
275				goto err;
276			}
277		}
278
279		if ((ret = __memp_fget(mpf, &pg,
280		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
281			goto err;
282	}
283	/* NOTREACHED */
284
285err:	if (h != NULL && (t_ret = __memp_fput(mpf,
286	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
287		ret = t_ret;
288
289	BT_STK_POP(cp);
290	__bam_stkrel(dbc, 0);
291
292	return (ret);
293}
294
295/*
296 * __bam_adjust --
297 *	Adjust the tree after adding or deleting a record.
298 *
299 * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
300 */
301int
302__bam_adjust(dbc, adjust)
303	DBC *dbc;
304	int32_t adjust;
305{
306	BTREE_CURSOR *cp;
307	DB *dbp;
308	DB_MPOOLFILE *mpf;
309	EPG *epg;
310	PAGE *h;
311	db_pgno_t root_pgno;
312	int ret;
313
314	dbp = dbc->dbp;
315	mpf = dbp->mpf;
316	cp = (BTREE_CURSOR *)dbc->internal;
317	root_pgno = cp->root;
318
319	/* Update the record counts for the tree. */
320	for (epg = cp->sp; epg <= cp->csp; ++epg) {
321		h = epg->page;
322		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
323			if ((ret = __memp_dirty(mpf, &h,
324			    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
325				return (ret);
326			epg->page = h;
327			if (DBC_LOGGING(dbc)) {
328				if ((ret = __bam_cadjust_log(dbp, dbc->txn,
329				    &LSN(h), 0, PGNO(h), &LSN(h),
330				    (u_int32_t)epg->indx, adjust,
331				    PGNO(h) == root_pgno ?
332				    CAD_UPDATEROOT : 0)) != 0)
333					return (ret);
334			} else
335				LSN_NOT_LOGGED(LSN(h));
336
337			if (TYPE(h) == P_IBTREE)
338				GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
339				    adjust;
340			else
341				GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
342				    adjust;
343
344			if (PGNO(h) == root_pgno)
345				RE_NREC_ADJ(h, adjust);
346		}
347	}
348	return (0);
349}
350
351/*
352 * __bam_nrecs --
353 *	Return the number of records in the tree.
354 *
355 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
356 */
357int
358__bam_nrecs(dbc, rep)
359	DBC *dbc;
360	db_recno_t *rep;
361{
362	DB *dbp;
363	DB_LOCK lock;
364	DB_MPOOLFILE *mpf;
365	PAGE *h;
366	db_pgno_t pgno;
367	int ret, t_ret;
368
369	dbp = dbc->dbp;
370	mpf = dbp->mpf;
371
372	pgno = dbc->internal->root;
373	if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
374		return (ret);
375	if ((ret = __memp_fget(mpf, &pgno,
376	     dbc->thread_info, dbc->txn, 0, &h)) != 0)
377		return (ret);
378
379	*rep = RE_NREC(h);
380
381	ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
382	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
383		ret = t_ret;
384
385	return (ret);
386}
387
388/*
389 * __bam_total --
390 *	Return the number of records below a page.
391 *
392 * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
393 */
394db_recno_t
395__bam_total(dbp, h)
396	DB *dbp;
397	PAGE *h;
398{
399	db_recno_t nrecs;
400	db_indx_t indx, top;
401
402	nrecs = 0;
403	top = NUM_ENT(h);
404
405	switch (TYPE(h)) {
406	case P_LBTREE:
407		/* Check for logically deleted records. */
408		for (indx = 0; indx < top; indx += P_INDX)
409			if (!B_DISSET(
410			    GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
411				++nrecs;
412		break;
413	case P_LDUP:
414		/* Check for logically deleted records. */
415		for (indx = 0; indx < top; indx += O_INDX)
416			if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
417				++nrecs;
418		break;
419	case P_IBTREE:
420		for (indx = 0; indx < top; indx += O_INDX)
421			nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
422		break;
423	case P_LRECNO:
424		nrecs = NUM_ENT(h);
425		break;
426	case P_IRECNO:
427		for (indx = 0; indx < top; indx += O_INDX)
428			nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
429		break;
430	}
431
432	return (nrecs);
433}
434