1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * $Id$
39 */
40
41#include "db_config.h"
42
43#include "db_int.h"
44#include "dbinc/db_page.h"
45#include "dbinc/btree.h"
46#include "dbinc/lock.h"
47#include "dbinc/mp.h"
48
49/*
50 * __bam_rsearch --
51 *	Search a btree for a record number.
52 *
53 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
54 */
55int
56__bam_rsearch(dbc, recnop, flags, stop, exactp)
57	DBC *dbc;
58	db_recno_t *recnop;
59	u_int32_t flags;
60	int stop, *exactp;
61{
62	BINTERNAL *bi;
63	BTREE_CURSOR *cp;
64	DB *dbp;
65	DB_LOCK lock;
66	DB_MPOOLFILE *mpf;
67	ENV *env;
68	PAGE *h;
69	RINTERNAL *ri;
70	db_indx_t adjust, deloffset, indx, top;
71	db_lockmode_t lock_mode;
72	db_pgno_t pg;
73	db_recno_t recno, t_recno, total;
74	u_int32_t get_mode;
75	int ret, stack, t_ret;
76
77	dbp = dbc->dbp;
78	env = dbp->env;
79	mpf = dbp->mpf;
80	cp = (BTREE_CURSOR *)dbc->internal;
81	h = NULL;
82
83	BT_STK_CLR(cp);
84
85	/*
86	 * There are several ways we search a btree tree.  The flags argument
87	 * specifies if we're acquiring read or write locks and if we are
88	 * locking pairs of pages.  In addition, if we're adding or deleting
89	 * an item, we have to lock the entire tree, regardless.  See btree.h
90	 * for more details.
91	 *
92	 * If write-locking pages, we need to know whether or not to acquire a
93	 * write lock on a page before getting it.  This depends on how deep it
94	 * is in tree, which we don't know until we acquire the root page.  So,
95	 * if we need to lock the root page we may have to upgrade it later,
96	 * because we won't get the correct lock initially.
97	 *
98	 * Retrieve the root page.
99	 */
100
101	if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0)
102		return (ret);
103	lock_mode = cp->csp->lock_mode;
104	get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
105	lock = cp->csp->lock;
106	h = cp->csp->page;
107
108	BT_STK_CLR(cp);
109	/*
110	 * If appending to the tree, set the record number now -- we have the
111	 * root page locked.
112	 *
113	 * Delete only deletes exact matches, read only returns exact matches.
114	 * Note, this is different from __bam_search(), which returns non-exact
115	 * matches for read.
116	 *
117	 * The record may not exist.  We can only return the correct location
118	 * for the record immediately after the last record in the tree, so do
119	 * a fast check now.
120	 */
121	total = RE_NREC(h);
122	if (LF_ISSET(SR_APPEND)) {
123		*exactp = 0;
124		*recnop = recno = total + 1;
125	} else {
126		recno = *recnop;
127		if (recno <= total)
128			*exactp = 1;
129		else {
130			*exactp = 0;
131			if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
132				/*
133				 * Keep the page locked for serializability.
134				 *
135				 * XXX
136				 * This leaves the root page locked, which will
137				 * eliminate any concurrency.  A possible fix
138				 * would be to lock the last leaf page instead.
139				 */
140				ret = __memp_fput(mpf,
141				    dbc->thread_info, h, dbc->priority);
142				if ((t_ret =
143				    __TLPUT(dbc, lock)) != 0 && ret == 0)
144					ret = t_ret;
145				return (ret == 0 ? DB_NOTFOUND : ret);
146			}
147		}
148	}
149
150	/*
151	 * !!!
152	 * Record numbers in the tree are 0-based, but the recno is
153	 * 1-based.  All of the calculations below have to take this
154	 * into account.
155	 */
156	for (total = 0;;) {
157		switch (TYPE(h)) {
158		case P_LBTREE:
159			if (LF_ISSET(SR_MAX)) {
160				indx = NUM_ENT(h) - 2;
161				goto enter;
162			}
163			/* FALLTHROUGH */
164		case P_LDUP:
165			if (LF_ISSET(SR_MAX)) {
166				indx = NUM_ENT(h) - 1;
167				goto enter;
168			}
169			recno -= total;
170			/*
171			 * There may be logically deleted records on the page.
172			 * If there are enough, the record may not exist.
173			 */
174			if (TYPE(h) == P_LBTREE) {
175				adjust = P_INDX;
176				deloffset = O_INDX;
177			} else {
178				adjust = O_INDX;
179				deloffset = 0;
180			}
181			for (t_recno = 0, indx = 0;; indx += adjust) {
182				if (indx >= NUM_ENT(h)) {
183					*exactp = 0;
184					if (!LF_ISSET(SR_PAST_EOF) ||
185					    recno > t_recno + 1) {
186						ret = __memp_fput(mpf,
187						    dbc->thread_info,
188						    h, dbc->priority);
189						h = NULL;
190						if ((t_ret = __TLPUT(dbc,
191						    lock)) != 0 && ret == 0)
192							ret = t_ret;
193						if (ret == 0)
194							ret = DB_NOTFOUND;
195						goto err;
196					}
197				}
198				if (!B_DISSET(GET_BKEYDATA(dbp, h,
199				    indx + deloffset)->type) &&
200				    ++t_recno == recno)
201					break;
202			}
203
204			BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
205			if (ret != 0)
206				goto err;
207			if (LF_ISSET(SR_BOTH))
208				goto get_prev;
209			return (0);
210		case P_IBTREE:
211			if (LF_ISSET(SR_MAX)) {
212				indx = NUM_ENT(h);
213				bi = GET_BINTERNAL(dbp, h, indx - 1);
214			} else for (indx = 0, top = NUM_ENT(h);;) {
215				bi = GET_BINTERNAL(dbp, h, indx);
216				if (++indx == top || total + bi->nrecs >= recno)
217					break;
218				total += bi->nrecs;
219			}
220			pg = bi->pgno;
221			break;
222		case P_LRECNO:
223			if (LF_ISSET(SR_MAX))
224				recno = NUM_ENT(h);
225			else
226				recno -= total;
227
228			/* Correct from 1-based to 0-based for a page offset. */
229			--recno;
230enter:			BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
231			if (ret != 0)
232				goto err;
233			if (LF_ISSET(SR_BOTH)) {
234get_prev:			DB_ASSERT(env, LF_ISSET(SR_NEXT));
235				/*
236				 * We have a NEXT tree, now add the sub tree
237				 * that points gets to the previous page.
238				 */
239				cp->csp++;
240				indx = cp->sp->indx - 1;
241				h = cp->sp->page;
242				if (TYPE(h) == P_IRECNO) {
243					ri = GET_RINTERNAL(dbp, h, indx);
244					pg = ri->pgno;
245				} else {
246					DB_ASSERT(env, TYPE(h) == P_IBTREE);
247					bi = GET_BINTERNAL(dbp, h, indx);
248					pg = bi->pgno;
249				}
250				LF_CLR(SR_NEXT | SR_BOTH);
251				LF_SET(SR_MAX);
252				stack = 1;
253				h = NULL;
254				goto lock_next;
255			}
256			return (0);
257		case P_IRECNO:
258			if (LF_ISSET(SR_MAX)) {
259				indx = NUM_ENT(h);
260				ri = GET_RINTERNAL(dbp, h, indx - 1);
261			} else for (indx = 0, top = NUM_ENT(h);;) {
262				ri = GET_RINTERNAL(dbp, h, indx);
263				if (++indx == top || total + ri->nrecs >= recno)
264					break;
265				total += ri->nrecs;
266			}
267			pg = ri->pgno;
268			break;
269		default:
270			return (__db_pgfmt(env, h->pgno));
271		}
272		--indx;
273
274		/* Return if this is the lowest page wanted. */
275		if (stop == LEVEL(h)) {
276			BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
277			if (ret != 0)
278				goto err;
279			return (0);
280		}
281		if (stack) {
282			BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
283			if (ret != 0)
284				goto err;
285			h = NULL;
286
287			lock_mode = DB_LOCK_WRITE;
288			get_mode = DB_MPOOL_DIRTY;
289			if ((ret =
290			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
291				goto err;
292		} else if (LF_ISSET(SR_NEXT)) {
293			/*
294			 * For RECNO if we are doing a NEXT search the
295			 * search recno is the one we are looking for
296			 * but we want to keep the stack from the spanning
297			 * node on down.  We only know we have the spanning
298			 * node when its child's index is 0, so save
299			 * each node and discard the tree when we find out
300			 * its not needed.
301			 */
302			if (indx != 0 && cp->sp->page != NULL) {
303				BT_STK_POP(cp);
304				if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
305					goto err;
306			}
307
308			BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
309			h = NULL;
310			if (ret != 0)
311				goto err;
312lock_next:		if ((ret =
313			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
314				goto err;
315		} else {
316			/*
317			 * Decide if we want to return a pointer to the next
318			 * page in the stack.  If we do, write lock it and
319			 * never unlock it.
320			 */
321			if ((LF_ISSET(SR_PARENT) &&
322			    (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
323			    (LEVEL(h) - 1) == LEAFLEVEL)
324				stack = 1;
325
326			if ((ret = __memp_fput(mpf,
327			    dbc->thread_info, h, dbc->priority)) != 0)
328				goto err;
329			h = NULL;
330
331			lock_mode = stack &&
332			    LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
333			if (lock_mode == DB_LOCK_WRITE)
334				get_mode = DB_MPOOL_DIRTY;
335			if ((ret = __db_lget(dbc,
336			    LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
337				/*
338				 * If we fail, discard the lock we held.  This
339				 * is OK because this only happens when we are
340				 * descending the tree holding read-locks.
341				 */
342				(void)__LPUT(dbc, lock);
343				goto err;
344			}
345		}
346
347		if ((ret = __memp_fget(mpf, &pg,
348		     dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
349			goto err;
350	}
351	/* NOTREACHED */
352
353err:	if (h != NULL && (t_ret = __memp_fput(mpf,
354	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
355		ret = t_ret;
356
357	BT_STK_POP(cp);
358	(void)__bam_stkrel(dbc, 0);
359
360	return (ret);
361}
362
363/*
364 * __bam_adjust --
365 *	Adjust the tree after adding or deleting a record.
366 *
367 * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
368 */
369int
370__bam_adjust(dbc, adjust)
371	DBC *dbc;
372	int32_t adjust;
373{
374	BTREE_CURSOR *cp;
375	DB *dbp;
376	DB_MPOOLFILE *mpf;
377	EPG *epg;
378	PAGE *h;
379	db_pgno_t root_pgno;
380	int ret;
381
382	dbp = dbc->dbp;
383	mpf = dbp->mpf;
384	cp = (BTREE_CURSOR *)dbc->internal;
385	root_pgno = cp->root;
386
387	/* Update the record counts for the tree. */
388	for (epg = cp->sp; epg <= cp->csp; ++epg) {
389		h = epg->page;
390		if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
391			ret = __memp_dirty(mpf, &h,
392			    dbc->thread_info, dbc->txn, dbc->priority, 0);
393			epg->page = h;
394			if (ret != 0)
395				return (ret);
396			if (DBC_LOGGING(dbc)) {
397				if ((ret = __bam_cadjust_log(dbp, dbc->txn,
398				    &LSN(h), 0, PGNO(h), &LSN(h),
399				    (u_int32_t)epg->indx, adjust,
400				    PGNO(h) == root_pgno ?
401				    CAD_UPDATEROOT : 0)) != 0)
402					return (ret);
403			} else
404				LSN_NOT_LOGGED(LSN(h));
405
406			if (TYPE(h) == P_IBTREE)
407				GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
408				    adjust;
409			else
410				GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
411				    adjust;
412
413			if (PGNO(h) == root_pgno)
414				RE_NREC_ADJ(h, adjust);
415		}
416	}
417	return (0);
418}
419
420/*
421 * __bam_nrecs --
422 *	Return the number of records in the tree.
423 *
424 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
425 */
426int
427__bam_nrecs(dbc, rep)
428	DBC *dbc;
429	db_recno_t *rep;
430{
431	DB *dbp;
432	DB_LOCK lock;
433	DB_MPOOLFILE *mpf;
434	PAGE *h;
435	db_pgno_t pgno;
436	int ret, t_ret;
437
438	dbp = dbc->dbp;
439	mpf = dbp->mpf;
440
441	pgno = dbc->internal->root;
442	if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
443		return (ret);
444	if ((ret = __memp_fget(mpf, &pgno,
445	     dbc->thread_info, dbc->txn, 0, &h)) != 0)
446		return (ret);
447
448	*rep = RE_NREC(h);
449
450	ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
451	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
452		ret = t_ret;
453
454	return (ret);
455}
456
457/*
458 * __bam_total --
459 *	Return the number of records below a page.
460 *
461 * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
462 */
463db_recno_t
464__bam_total(dbp, h)
465	DB *dbp;
466	PAGE *h;
467{
468	db_recno_t nrecs;
469	db_indx_t indx, top;
470
471	nrecs = 0;
472	top = NUM_ENT(h);
473
474	switch (TYPE(h)) {
475	case P_LBTREE:
476		/* Check for logically deleted records. */
477		for (indx = 0; indx < top; indx += P_INDX)
478			if (!B_DISSET(
479			    GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
480				++nrecs;
481		break;
482	case P_LDUP:
483		/* Check for logically deleted records. */
484		for (indx = 0; indx < top; indx += O_INDX)
485			if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
486				++nrecs;
487		break;
488	case P_IBTREE:
489		for (indx = 0; indx < top; indx += O_INDX)
490			nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
491		break;
492	case P_LRECNO:
493		nrecs = NUM_ENT(h);
494		break;
495	case P_IRECNO:
496		for (indx = 0; indx < top; indx += O_INDX)
497			nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
498		break;
499	}
500
501	return (nrecs);
502}
503