1/*	$OpenBSD: bt_seq.c,v 1.12 2022/12/27 17:10:06 jmc Exp $	*/
2
3/*-
4 * Copyright (c) 1990, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#include <sys/types.h>
36
37#include <errno.h>
38#include <stddef.h>
39#include <stdio.h>
40#include <stdlib.h>
41
42#include <db.h>
43#include "btree.h"
44
45static int __bt_first(BTREE *, const DBT *, EPG *, int *);
46static int __bt_seqadv(BTREE *, EPG *, int);
47static int __bt_seqset(BTREE *, EPG *, DBT *, int);
48
49/*
50 * Sequential scan support.
51 *
52 * The tree can be scanned sequentially, starting from either end of the
53 * tree or from any specific key.  A scan request before any scanning is
54 * done is initialized as starting from the least node.
55 */
56
57/*
58 * __bt_seq --
59 *	Btree sequential scan interface.
60 *
61 * Parameters:
62 *	dbp:	pointer to access method
63 *	key:	key for positioning and return value
64 *	data:	data return value
65 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
66 *
67 * Returns:
68 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
69 */
70int
71__bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
72{
73	BTREE *t;
74	EPG e;
75	int status;
76
77	t = dbp->internal;
78
79	/* Toss any page pinned across calls. */
80	if (t->bt_pinned != NULL) {
81		mpool_put(t->bt_mp, t->bt_pinned, 0);
82		t->bt_pinned = NULL;
83	}
84
85	/*
86	 * If scan uninitialized as yet, or starting at a specific record, set
87	 * the scan to a specific key.  Both __bt_seqset and __bt_seqadv pin
88	 * the page the cursor references if they're successful.
89	 */
90	switch (flags) {
91	case R_NEXT:
92	case R_PREV:
93		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
94			status = __bt_seqadv(t, &e, flags);
95			break;
96		}
97		/* FALLTHROUGH */
98	case R_FIRST:
99	case R_LAST:
100	case R_CURSOR:
101		status = __bt_seqset(t, &e, key, flags);
102		break;
103	default:
104		errno = EINVAL;
105		return (RET_ERROR);
106	}
107
108	if (status == RET_SUCCESS) {
109		__bt_setcur(t, e.page->pgno, e.index);
110
111		status =
112		    __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
113
114		/*
115		 * If the user is doing concurrent access, we copied the
116		 * key/data, toss the page.
117		 */
118		if (F_ISSET(t, B_DB_LOCK))
119			mpool_put(t->bt_mp, e.page, 0);
120		else
121			t->bt_pinned = e.page;
122	}
123	return (status);
124}
125
126/*
127 * __bt_seqset --
128 *	Set the sequential scan to a specific key.
129 *
130 * Parameters:
131 *	t:	tree
132 *	ep:	storage for returned key
133 *	key:	key for initial scan position
134 *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
135 *
136 * Side effects:
137 *	Pins the page the cursor references.
138 *
139 * Returns:
140 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
141 */
142static int
143__bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
144{
145	PAGE *h;
146	pgno_t pg;
147	int exact;
148
149	/*
150	 * Find the first, last or specific key in the tree and point the
151	 * cursor at it.  The cursor may not be moved until a new key has
152	 * been found.
153	 */
154	switch (flags) {
155	case R_CURSOR:				/* Keyed scan. */
156		/*
157		 * Find the first instance of the key or the smallest key
158		 * which is greater than or equal to the specified key.
159		 */
160		if (key->data == NULL || key->size == 0) {
161			errno = EINVAL;
162			return (RET_ERROR);
163		}
164		return (__bt_first(t, key, ep, &exact));
165	case R_FIRST:				/* First record. */
166	case R_NEXT:
167		/* Walk down the left-hand side of the tree. */
168		for (pg = P_ROOT;;) {
169			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
170				return (RET_ERROR);
171
172			/* Check for an empty tree. */
173			if (NEXTINDEX(h) == 0) {
174				mpool_put(t->bt_mp, h, 0);
175				return (RET_SPECIAL);
176			}
177
178			if (h->flags & (P_BLEAF | P_RLEAF))
179				break;
180			pg = GETBINTERNAL(h, 0)->pgno;
181			mpool_put(t->bt_mp, h, 0);
182		}
183		ep->page = h;
184		ep->index = 0;
185		break;
186	case R_LAST:				/* Last record. */
187	case R_PREV:
188		/* Walk down the right-hand side of the tree. */
189		for (pg = P_ROOT;;) {
190			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
191				return (RET_ERROR);
192
193			/* Check for an empty tree. */
194			if (NEXTINDEX(h) == 0) {
195				mpool_put(t->bt_mp, h, 0);
196				return (RET_SPECIAL);
197			}
198
199			if (h->flags & (P_BLEAF | P_RLEAF))
200				break;
201			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
202			mpool_put(t->bt_mp, h, 0);
203		}
204
205		ep->page = h;
206		ep->index = NEXTINDEX(h) - 1;
207		break;
208	}
209	return (RET_SUCCESS);
210}
211
212/*
213 * __bt_seqadvance --
214 *	Advance the sequential scan.
215 *
216 * Parameters:
217 *	t:	tree
218 *	flags:	R_NEXT, R_PREV
219 *
220 * Side effects:
221 *	Pins the page the new key/data record is on.
222 *
223 * Returns:
224 *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
225 */
226static int
227__bt_seqadv(BTREE *t, EPG *ep, int flags)
228{
229	CURSOR *c;
230	PAGE *h;
231	indx_t idx;
232	pgno_t pg;
233	int exact;
234
235	/*
236	 * There are a couple of states that we can be in.  The cursor has
237	 * been initialized by the time we get here, but that's all we know.
238	 */
239	c = &t->bt_cursor;
240
241	/*
242	 * The cursor was deleted where there weren't any duplicate records,
243	 * so the key was saved.  Find out where that key would go in the
244	 * current tree.  It doesn't matter if the returned key is an exact
245	 * match or not -- if it's an exact match, the record was added after
246	 * the delete so we can just return it.  If not, as long as there's
247	 * a record there, return it.
248	 */
249	if (F_ISSET(c, CURS_ACQUIRE))
250		return (__bt_first(t, &c->key, ep, &exact));
251
252	/* Get the page referenced by the cursor. */
253	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
254		return (RET_ERROR);
255
256	/*
257 	 * Find the next/previous record in the tree and point the cursor at
258	 * it.  The cursor may not be moved until a new key has been found.
259	 */
260	switch (flags) {
261	case R_NEXT:			/* Next record. */
262		/*
263		 * The cursor was deleted in duplicate records, and moved
264		 * forward to a record that has yet to be returned.  Clear
265		 * that flag, and return the record.
266		 */
267		if (F_ISSET(c, CURS_AFTER))
268			goto usecurrent;
269		idx = c->pg.index;
270		if (++idx == NEXTINDEX(h)) {
271			pg = h->nextpg;
272			mpool_put(t->bt_mp, h, 0);
273			if (pg == P_INVALID)
274				return (RET_SPECIAL);
275			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
276				return (RET_ERROR);
277			idx = 0;
278		}
279		break;
280	case R_PREV:			/* Previous record. */
281		/*
282		 * The cursor was deleted in duplicate records, and moved
283		 * backward to a record that has yet to be returned.  Clear
284		 * that flag, and return the record.
285		 */
286		if (F_ISSET(c, CURS_BEFORE)) {
287usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
288			ep->page = h;
289			ep->index = c->pg.index;
290			return (RET_SUCCESS);
291		}
292		idx = c->pg.index;
293		if (idx == 0) {
294			pg = h->prevpg;
295			mpool_put(t->bt_mp, h, 0);
296			if (pg == P_INVALID)
297				return (RET_SPECIAL);
298			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
299				return (RET_ERROR);
300			idx = NEXTINDEX(h) - 1;
301		} else
302			--idx;
303		break;
304	}
305
306	ep->page = h;
307	ep->index = idx;
308	return (RET_SUCCESS);
309}
310
311/*
312 * __bt_first --
313 *	Find the first entry.
314 *
315 * Parameters:
316 *	t:	the tree
317 *    key:	the key
318 *  erval:	return EPG
319 * exactp:	pointer to exact match flag
320 *
321 * Returns:
322 *	The first entry in the tree greater than or equal to key,
323 *	or RET_SPECIAL if no such key exists.
324 */
325static int
326__bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
327{
328	PAGE *h;
329	EPG *ep, save;
330	pgno_t pg;
331
332	/*
333	 * Find any matching record; __bt_search pins the page.
334	 *
335	 * If it's an exact match and duplicates are possible, walk backwards
336	 * in the tree until we find the first one.  Otherwise, make sure it's
337	 * a valid key (__bt_search may return an index just past the end of a
338	 * page) and return it.
339	 */
340	if ((ep = __bt_search(t, key, exactp)) == NULL)
341		return (0);
342	if (*exactp) {
343		if (F_ISSET(t, B_NODUPS)) {
344			*erval = *ep;
345			return (RET_SUCCESS);
346		}
347
348		/*
349		 * Walk backwards, as long as the entry matches and there are
350		 * keys left in the tree.  Save a copy of each match in case
351		 * we go too far.
352		 */
353		save = *ep;
354		h = ep->page;
355		do {
356			if (save.page->pgno != ep->page->pgno) {
357				mpool_put(t->bt_mp, save.page, 0);
358				save = *ep;
359			} else
360				save.index = ep->index;
361
362			/*
363			 * Don't unpin the page the last (or original) match
364			 * was on, but make sure it's unpinned if an error
365			 * occurs.
366			 */
367			if (ep->index == 0) {
368				if (h->prevpg == P_INVALID)
369					break;
370				if (h->pgno != save.page->pgno)
371					mpool_put(t->bt_mp, h, 0);
372				if ((h = mpool_get(t->bt_mp,
373				    h->prevpg, 0)) == NULL) {
374					if (h->pgno == save.page->pgno)
375						mpool_put(t->bt_mp,
376						    save.page, 0);
377					return (RET_ERROR);
378				}
379				ep->page = h;
380				ep->index = NEXTINDEX(h);
381			}
382			--ep->index;
383		} while (__bt_cmp(t, key, ep) == 0);
384
385		/*
386		 * Reach here with the last page that was looked at pinned,
387		 * which may or may not be the same as the last (or original)
388		 * match page.  If it's not useful, release it.
389		 */
390		if (h->pgno != save.page->pgno)
391			mpool_put(t->bt_mp, h, 0);
392
393		*erval = save;
394		return (RET_SUCCESS);
395	}
396
397	/* If at the end of a page, find the next entry. */
398	if (ep->index == NEXTINDEX(ep->page)) {
399		h = ep->page;
400		pg = h->nextpg;
401		mpool_put(t->bt_mp, h, 0);
402		if (pg == P_INVALID)
403			return (RET_SPECIAL);
404		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
405			return (RET_ERROR);
406		ep->index = 0;
407		ep->page = h;
408	}
409	*erval = *ep;
410	return (RET_SUCCESS);
411}
412
413/*
414 * __bt_setcur --
415 *	Set the cursor to an entry in the tree.
416 *
417 * Parameters:
418 *	t:	the tree
419 *   pgno:	page number
420 *    idx:	page index
421 */
422void
423__bt_setcur(BTREE *t, pgno_t pgno, u_int idx)
424{
425	/* Lose any already deleted key. */
426	if (t->bt_cursor.key.data != NULL) {
427		free(t->bt_cursor.key.data);
428		t->bt_cursor.key.size = 0;
429		t->bt_cursor.key.data = NULL;
430	}
431	F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
432
433	/* Update the cursor. */
434	t->bt_cursor.pg.pgno = pgno;
435	t->bt_cursor.pg.index = idx;
436	F_SET(&t->bt_cursor, CURS_INIT);
437}
438