bt_delete.c revision 1.9
1/*	$NetBSD: bt_delete.c,v 1.9 1997/07/21 14:06:31 jtc Exp $	*/
2
3/*-
4 * Copyright (c) 1990, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#include <sys/cdefs.h>
40#if defined(LIBC_SCCS) && !defined(lint)
41#if 0
42static char sccsid[] = "@(#)bt_delete.c	8.13 (Berkeley) 7/28/94";
43#else
44__RCSID("$NetBSD: bt_delete.c,v 1.9 1997/07/21 14:06:31 jtc Exp $");
45#endif
46#endif /* LIBC_SCCS and not lint */
47
48#include "namespace.h"
49#include <sys/types.h>
50
51#include <errno.h>
52#include <stdio.h>
53#include <string.h>
54
55#include <db.h>
56#include "btree.h"
57
58static int __bt_bdelete __P((BTREE *, const DBT *));
59static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int));
60static int __bt_pdelete __P((BTREE *, PAGE *));
61static int __bt_relink __P((BTREE *, PAGE *));
62static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *));
63
64/*
65 * __bt_delete
66 *	Delete the item(s) referenced by a key.
67 *
68 * Return RET_SPECIAL if the key is not found.
69 */
70int
71__bt_delete(dbp, key, flags)
72	const DB *dbp;
73	const DBT *key;
74	u_int flags;
75{
76	BTREE *t;
77	CURSOR *c;
78	PAGE *h;
79	int status;
80
81	t = dbp->internal;
82
83	/* Toss any page pinned across calls. */
84	if (t->bt_pinned != NULL) {
85		mpool_put(t->bt_mp, t->bt_pinned, 0);
86		t->bt_pinned = NULL;
87	}
88
89	/* Check for change to a read-only tree. */
90	if (F_ISSET(t, B_RDONLY)) {
91		errno = EPERM;
92		return (RET_ERROR);
93	}
94
95	switch (flags) {
96	case 0:
97		status = __bt_bdelete(t, key);
98		break;
99	case R_CURSOR:
100		/*
101		 * If flags is R_CURSOR, delete the cursor.  Must already
102		 * have started a scan and not have already deleted it.
103		 */
104		c = &t->bt_cursor;
105		if (F_ISSET(c, CURS_INIT)) {
106			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
107				return (RET_SPECIAL);
108			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
109				return (RET_ERROR);
110
111			/*
112			 * If the page is about to be emptied, we'll need to
113			 * delete it, which means we have to acquire a stack.
114			 */
115			if (NEXTINDEX(h) == 1)
116				if (__bt_stkacq(t, &h, &t->bt_cursor))
117					return (RET_ERROR);
118
119			status = __bt_dleaf(t, NULL, h, c->pg.index);
120
121			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
122				if (__bt_pdelete(t, h))
123					return (RET_ERROR);
124			} else
125				mpool_put(t->bt_mp,
126				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
127			break;
128		}
129		/* FALLTHROUGH */
130	default:
131		errno = EINVAL;
132		return (RET_ERROR);
133	}
134	if (status == RET_SUCCESS)
135		F_SET(t, B_MODIFIED);
136	return (status);
137}
138
139/*
140 * __bt_stkacq --
141 *	Acquire a stack so we can delete a cursor entry.
142 *
143 * Parameters:
144 *	  t:	tree
145 *	 hp:	pointer to current, pinned PAGE pointer
146 *	  c:	pointer to the cursor
147 *
148 * Returns:
149 *	0 on success, 1 on failure
150 */
151static int
152__bt_stkacq(t, hp, c)
153	BTREE *t;
154	PAGE **hp;
155	CURSOR *c;
156{
157	BINTERNAL *bi;
158	EPG *e;
159	EPGNO *parent;
160	PAGE *h;
161	indx_t index = 0;	/* Pacify gcc */
162	pgno_t pgno;
163	recno_t nextpg, prevpg;
164	int exact, level;
165
166	/*
167	 * Find the first occurrence of the key in the tree.  Toss the
168	 * currently locked page so we don't hit an already-locked page.
169	 */
170	h = *hp;
171	mpool_put(t->bt_mp, h, 0);
172	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
173		return (1);
174	h = e->page;
175
176	/* See if we got it in one shot. */
177	if (h->pgno == c->pg.pgno)
178		goto ret;
179
180	/*
181	 * Move right, looking for the page.  At each move we have to move
182	 * up the stack until we don't have to move to the next page.  If
183	 * we have to change pages at an internal level, we have to fix the
184	 * stack back up.
185	 */
186	while (h->pgno != c->pg.pgno) {
187		if ((nextpg = h->nextpg) == P_INVALID)
188			break;
189		mpool_put(t->bt_mp, h, 0);
190
191		/* Move up the stack. */
192		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
193			/* Get the parent page. */
194			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
195				return (1);
196
197			/* Move to the next index. */
198			if (parent->index != NEXTINDEX(h) - 1) {
199				index = parent->index + 1;
200				BT_PUSH(t, h->pgno, index);
201				break;
202			}
203			mpool_put(t->bt_mp, h, 0);
204		}
205
206		/* Restore the stack. */
207		while (level--) {
208			/* Push the next level down onto the stack. */
209			bi = GETBINTERNAL(h, index);
210			pgno = bi->pgno;
211			BT_PUSH(t, pgno, 0);
212
213			/* Lose the currently pinned page. */
214			mpool_put(t->bt_mp, h, 0);
215
216			/* Get the next level down. */
217			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
218				return (1);
219			index = 0;
220		}
221		mpool_put(t->bt_mp, h, 0);
222		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
223			return (1);
224	}
225
226	if (h->pgno == c->pg.pgno)
227		goto ret;
228
229	/* Reacquire the original stack. */
230	mpool_put(t->bt_mp, h, 0);
231	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
232		return (1);
233	h = e->page;
234
235	/*
236	 * Move left, looking for the page.  At each move we have to move
237	 * up the stack until we don't have to change pages to move to the
238	 * next page.  If we have to change pages at an internal level, we
239	 * have to fix the stack back up.
240	 */
241	while (h->pgno != c->pg.pgno) {
242		if ((prevpg = h->prevpg) == P_INVALID)
243			break;
244		mpool_put(t->bt_mp, h, 0);
245
246		/* Move up the stack. */
247		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
248			/* Get the parent page. */
249			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
250				return (1);
251
252			/* Move to the next index. */
253			if (parent->index != 0) {
254				index = parent->index - 1;
255				BT_PUSH(t, h->pgno, index);
256				break;
257			}
258			mpool_put(t->bt_mp, h, 0);
259		}
260
261		/* Restore the stack. */
262		while (level--) {
263			/* Push the next level down onto the stack. */
264			bi = GETBINTERNAL(h, index);
265			pgno = bi->pgno;
266
267			/* Lose the currently pinned page. */
268			mpool_put(t->bt_mp, h, 0);
269
270			/* Get the next level down. */
271			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
272				return (1);
273
274			index = NEXTINDEX(h) - 1;
275			BT_PUSH(t, pgno, index);
276		}
277		mpool_put(t->bt_mp, h, 0);
278		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
279			return (1);
280	}
281
282
283ret:	mpool_put(t->bt_mp, h, 0);
284	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
285}
286
287/*
288 * __bt_bdelete --
289 *	Delete all key/data pairs matching the specified key.
290 *
291 * Parameters:
292 *	  t:	tree
293 *	key:	key to delete
294 *
295 * Returns:
296 *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
297 */
298static int
299__bt_bdelete(t, key)
300	BTREE *t;
301	const DBT *key;
302{
303	EPG *e;
304	PAGE *h;
305	int deleted, exact, redo;
306
307	deleted = 0;
308
309	/* Find any matching record; __bt_search pins the page. */
310loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
311		return (deleted ? RET_SUCCESS : RET_ERROR);
312	if (!exact) {
313		mpool_put(t->bt_mp, e->page, 0);
314		return (deleted ? RET_SUCCESS : RET_SPECIAL);
315	}
316
317	/*
318	 * Delete forward, then delete backward, from the found key.  If
319	 * there are duplicates and we reach either side of the page, do
320	 * the key search again, so that we get them all.
321	 */
322	redo = 0;
323	h = e->page;
324	do {
325		if (__bt_dleaf(t, key, h, e->index)) {
326			mpool_put(t->bt_mp, h, 0);
327			return (RET_ERROR);
328		}
329		if (F_ISSET(t, B_NODUPS)) {
330			if (NEXTINDEX(h) == 0) {
331				if (__bt_pdelete(t, h))
332					return (RET_ERROR);
333			} else
334				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
335			return (RET_SUCCESS);
336		}
337		deleted = 1;
338	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
339
340	/* Check for right-hand edge of the page. */
341	if (e->index == NEXTINDEX(h))
342		redo = 1;
343
344	/* Delete from the key to the beginning of the page. */
345	while (e->index-- > 0) {
346		if (__bt_cmp(t, key, e) != 0)
347			break;
348		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
349			mpool_put(t->bt_mp, h, 0);
350			return (RET_ERROR);
351		}
352		if (e->index == 0)
353			redo = 1;
354	}
355
356	/* Check for an empty page. */
357	if (NEXTINDEX(h) == 0) {
358		if (__bt_pdelete(t, h))
359			return (RET_ERROR);
360		goto loop;
361	}
362
363	/* Put the page. */
364	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
365
366	if (redo)
367		goto loop;
368	return (RET_SUCCESS);
369}
370
371/*
372 * __bt_pdelete --
373 *	Delete a single page from the tree.
374 *
375 * Parameters:
376 *	t:	tree
377 *	h:	leaf page
378 *
379 * Returns:
380 *	RET_SUCCESS, RET_ERROR.
381 *
382 * Side-effects:
383 *	mpool_put's the page
384 */
385static int
386__bt_pdelete(t, h)
387	BTREE *t;
388	PAGE *h;
389{
390	BINTERNAL *bi;
391	PAGE *pg;
392	EPGNO *parent;
393	indx_t cnt, index, *ip, offset;
394	u_int32_t nksize;
395	char *from;
396
397	/*
398	 * Walk the parent page stack -- a LIFO stack of the pages that were
399	 * traversed when we searched for the page where the delete occurred.
400	 * Each stack entry is a page number and a page index offset.  The
401	 * offset is for the page traversed on the search.  We've just deleted
402	 * a page, so we have to delete the key from the parent page.
403	 *
404	 * If the delete from the parent page makes it empty, this process may
405	 * continue all the way up the tree.  We stop if we reach the root page
406	 * (which is never deleted, it's just not worth the effort) or if the
407	 * delete does not empty the page.
408	 */
409	while ((parent = BT_POP(t)) != NULL) {
410		/* Get the parent page. */
411		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
412			return (RET_ERROR);
413
414		index = parent->index;
415		bi = GETBINTERNAL(pg, index);
416
417		/* Free any overflow pages. */
418		if (bi->flags & P_BIGKEY &&
419		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
420			mpool_put(t->bt_mp, pg, 0);
421			return (RET_ERROR);
422		}
423
424		/*
425		 * Free the parent if it has only the one key and it's not the
426		 * root page. If it's the rootpage, turn it back into an empty
427		 * leaf page.
428		 */
429		if (NEXTINDEX(pg) == 1)
430			if (pg->pgno == P_ROOT) {
431				pg->lower = BTDATAOFF;
432				pg->upper = t->bt_psize;
433				pg->flags = P_BLEAF;
434			} else {
435				if (__bt_relink(t, pg) || __bt_free(t, pg))
436					return (RET_ERROR);
437				continue;
438			}
439		else {
440			/* Pack remaining key items at the end of the page. */
441			nksize = NBINTERNAL(bi->ksize);
442			from = (char *)pg + pg->upper;
443			memmove(from + nksize, from, (char *)bi - from);
444			pg->upper += nksize;
445
446			/* Adjust indices' offsets, shift the indices down. */
447			offset = pg->linp[index];
448			for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip)
449				if (ip[0] < offset)
450					ip[0] += nksize;
451			for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip)
452				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
453			pg->lower -= sizeof(indx_t);
454		}
455
456		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
457		break;
458	}
459
460	/* Free the leaf page, as long as it wasn't the root. */
461	if (h->pgno == P_ROOT) {
462		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
463		return (RET_SUCCESS);
464	}
465	return (__bt_relink(t, h) || __bt_free(t, h));
466}
467
468/*
469 * __bt_dleaf --
470 *	Delete a single record from a leaf page.
471 *
472 * Parameters:
473 *	t:	tree
474 *    key:	referenced key
475 *	h:	page
476 *	index:	index on page to delete
477 *
478 * Returns:
479 *	RET_SUCCESS, RET_ERROR.
480 */
481int
482__bt_dleaf(t, key, h, index)
483	BTREE *t;
484	const DBT *key;
485	PAGE *h;
486	u_int index;
487{
488	BLEAF *bl;
489	indx_t cnt, *ip, offset;
490	u_int32_t nbytes;
491	void *to;
492	char *from;
493
494	/* If this record is referenced by the cursor, delete the cursor. */
495	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
496	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
497	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index &&
498	    __bt_curdel(t, key, h, index))
499		return (RET_ERROR);
500
501	/* If the entry uses overflow pages, make them available for reuse. */
502	to = bl = GETBLEAF(h, index);
503	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
504		return (RET_ERROR);
505	if (bl->flags & P_BIGDATA &&
506	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
507		return (RET_ERROR);
508
509	/* Pack the remaining key/data items at the end of the page. */
510	nbytes = NBLEAF(bl);
511	from = (char *)h + h->upper;
512	memmove(from + nbytes, from, (char *)to - from);
513	h->upper += nbytes;
514
515	/* Adjust the indices' offsets, shift the indices down. */
516	offset = h->linp[index];
517	for (cnt = index, ip = &h->linp[0]; cnt--; ++ip)
518		if (ip[0] < offset)
519			ip[0] += nbytes;
520	for (cnt = NEXTINDEX(h) - index; --cnt; ++ip)
521		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
522	h->lower -= sizeof(indx_t);
523
524	/* If the cursor is on this page, adjust it as necessary. */
525	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
526	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
527	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index)
528		--t->bt_cursor.pg.index;
529
530	return (RET_SUCCESS);
531}
532
533/*
534 * __bt_curdel --
535 *	Delete the cursor.
536 *
537 * Parameters:
538 *	t:	tree
539 *    key:	referenced key (or NULL)
540 *	h:	page
541 *  index:	index on page to delete
542 *
543 * Returns:
544 *	RET_SUCCESS, RET_ERROR.
545 */
546static int
547__bt_curdel(t, key, h, index)
548	BTREE *t;
549	const DBT *key;
550	PAGE *h;
551	u_int index;
552{
553	CURSOR *c;
554	EPG e;
555	PAGE *pg;
556	int curcopy, status;
557
558	/*
559	 * If there are duplicates, move forward or backward to one.
560	 * Otherwise, copy the key into the cursor area.
561	 */
562	c = &t->bt_cursor;
563	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
564
565	curcopy = 0;
566	if (!F_ISSET(t, B_NODUPS)) {
567		/*
568		 * We're going to have to do comparisons.  If we weren't
569		 * provided a copy of the key, i.e. the user is deleting
570		 * the current cursor position, get one.
571		 */
572		if (key == NULL) {
573			e.page = h;
574			e.index = index;
575			if ((status = __bt_ret(t, &e,
576			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
577				return (status);
578			curcopy = 1;
579			key = &c->key;
580		}
581		/* Check previous key, if not at the beginning of the page. */
582		if (index > 0) {
583			e.page = h;
584			e.index = index - 1;
585			if (__bt_cmp(t, key, &e) == 0) {
586				F_SET(c, CURS_BEFORE);
587				goto dup2;
588			}
589		}
590		/* Check next key, if not at the end of the page. */
591		if (index < NEXTINDEX(h) - 1) {
592			e.page = h;
593			e.index = index + 1;
594			if (__bt_cmp(t, key, &e) == 0) {
595				F_SET(c, CURS_AFTER);
596				goto dup2;
597			}
598		}
599		/* Check previous key if at the beginning of the page. */
600		if (index == 0 && h->prevpg != P_INVALID) {
601			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
602				return (RET_ERROR);
603			e.page = pg;
604			e.index = NEXTINDEX(pg) - 1;
605			if (__bt_cmp(t, key, &e) == 0) {
606				F_SET(c, CURS_BEFORE);
607				goto dup1;
608			}
609			mpool_put(t->bt_mp, pg, 0);
610		}
611		/* Check next key if at the end of the page. */
612		if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
613			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
614				return (RET_ERROR);
615			e.page = pg;
616			e.index = 0;
617			if (__bt_cmp(t, key, &e) == 0) {
618				F_SET(c, CURS_AFTER);
619dup1:				mpool_put(t->bt_mp, pg, 0);
620dup2:				c->pg.pgno = e.page->pgno;
621				c->pg.index = e.index;
622				return (RET_SUCCESS);
623			}
624			mpool_put(t->bt_mp, pg, 0);
625		}
626	}
627	e.page = h;
628	e.index = index;
629	if (curcopy || (status =
630	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
631		F_SET(c, CURS_ACQUIRE);
632		return (RET_SUCCESS);
633	}
634	return (status);
635}
636
637/*
638 * __bt_relink --
639 *	Link around a deleted page.
640 *
641 * Parameters:
642 *	t:	tree
643 *	h:	page to be deleted
644 */
645static int
646__bt_relink(t, h)
647	BTREE *t;
648	PAGE *h;
649{
650	PAGE *pg;
651
652	if (h->nextpg != P_INVALID) {
653		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
654			return (RET_ERROR);
655		pg->prevpg = h->prevpg;
656		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
657	}
658	if (h->prevpg != P_INVALID) {
659		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
660			return (RET_ERROR);
661		pg->nextpg = h->nextpg;
662		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
663	}
664	return (0);
665}
666