bt_delete.c revision 1.7
1/*	$NetBSD: bt_delete.c,v 1.7 1996/05/03 21:50:44 cgd Exp $	*/
2
3/*-
4 * Copyright (c) 1990, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#if defined(LIBC_SCCS) && !defined(lint)
40#if 0
41static char sccsid[] = "@(#)bt_delete.c	8.13 (Berkeley) 7/28/94";
42#else
43static char rcsid[] = "$NetBSD: bt_delete.c,v 1.7 1996/05/03 21:50:44 cgd Exp $";
44#endif
45#endif /* LIBC_SCCS and not lint */
46
47#include <sys/types.h>
48
49#include <errno.h>
50#include <stdio.h>
51#include <string.h>
52
53#include <db.h>
54#include "btree.h"
55
56static int __bt_bdelete __P((BTREE *, const DBT *));
57static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int));
58static int __bt_pdelete __P((BTREE *, PAGE *));
59static int __bt_relink __P((BTREE *, PAGE *));
60static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *));
61
62/*
63 * __bt_delete
64 *	Delete the item(s) referenced by a key.
65 *
66 * Return RET_SPECIAL if the key is not found.
67 */
68int
69__bt_delete(dbp, key, flags)
70	const DB *dbp;
71	const DBT *key;
72	u_int flags;
73{
74	BTREE *t;
75	CURSOR *c;
76	PAGE *h;
77	int status;
78
79	t = dbp->internal;
80
81	/* Toss any page pinned across calls. */
82	if (t->bt_pinned != NULL) {
83		mpool_put(t->bt_mp, t->bt_pinned, 0);
84		t->bt_pinned = NULL;
85	}
86
87	/* Check for change to a read-only tree. */
88	if (F_ISSET(t, B_RDONLY)) {
89		errno = EPERM;
90		return (RET_ERROR);
91	}
92
93	switch (flags) {
94	case 0:
95		status = __bt_bdelete(t, key);
96		break;
97	case R_CURSOR:
98		/*
99		 * If flags is R_CURSOR, delete the cursor.  Must already
100		 * have started a scan and not have already deleted it.
101		 */
102		c = &t->bt_cursor;
103		if (F_ISSET(c, CURS_INIT)) {
104			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
105				return (RET_SPECIAL);
106			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
107				return (RET_ERROR);
108
109			/*
110			 * If the page is about to be emptied, we'll need to
111			 * delete it, which means we have to acquire a stack.
112			 */
113			if (NEXTINDEX(h) == 1)
114				if (__bt_stkacq(t, &h, &t->bt_cursor))
115					return (RET_ERROR);
116
117			status = __bt_dleaf(t, NULL, h, c->pg.index);
118
119			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
120				if (__bt_pdelete(t, h))
121					return (RET_ERROR);
122			} else
123				mpool_put(t->bt_mp,
124				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
125			break;
126		}
127		/* FALLTHROUGH */
128	default:
129		errno = EINVAL;
130		return (RET_ERROR);
131	}
132	if (status == RET_SUCCESS)
133		F_SET(t, B_MODIFIED);
134	return (status);
135}
136
137/*
138 * __bt_stkacq --
139 *	Acquire a stack so we can delete a cursor entry.
140 *
141 * Parameters:
142 *	  t:	tree
143 *	 hp:	pointer to current, pinned PAGE pointer
144 *	  c:	pointer to the cursor
145 *
146 * Returns:
147 *	0 on success, 1 on failure
148 */
149static int
150__bt_stkacq(t, hp, c)
151	BTREE *t;
152	PAGE **hp;
153	CURSOR *c;
154{
155	BINTERNAL *bi;
156	EPG *e;
157	EPGNO *parent;
158	PAGE *h;
159	indx_t index;
160	pgno_t pgno;
161	recno_t nextpg, prevpg;
162	int exact, level;
163
164	/*
165	 * Find the first occurrence of the key in the tree.  Toss the
166	 * currently locked page so we don't hit an already-locked page.
167	 */
168	h = *hp;
169	mpool_put(t->bt_mp, h, 0);
170	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
171		return (1);
172	h = e->page;
173
174	/* See if we got it in one shot. */
175	if (h->pgno == c->pg.pgno)
176		goto ret;
177
178	/*
179	 * Move right, looking for the page.  At each move we have to move
180	 * up the stack until we don't have to move to the next page.  If
181	 * we have to change pages at an internal level, we have to fix the
182	 * stack back up.
183	 */
184	while (h->pgno != c->pg.pgno) {
185		if ((nextpg = h->nextpg) == P_INVALID)
186			break;
187		mpool_put(t->bt_mp, h, 0);
188
189		/* Move up the stack. */
190		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
191			/* Get the parent page. */
192			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
193				return (1);
194
195			/* Move to the next index. */
196			if (parent->index != NEXTINDEX(h) - 1) {
197				index = parent->index + 1;
198				BT_PUSH(t, h->pgno, index);
199				break;
200			}
201			mpool_put(t->bt_mp, h, 0);
202		}
203
204		/* Restore the stack. */
205		while (level--) {
206			/* Push the next level down onto the stack. */
207			bi = GETBINTERNAL(h, index);
208			pgno = bi->pgno;
209			BT_PUSH(t, pgno, 0);
210
211			/* Lose the currently pinned page. */
212			mpool_put(t->bt_mp, h, 0);
213
214			/* Get the next level down. */
215			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
216				return (1);
217			index = 0;
218		}
219		mpool_put(t->bt_mp, h, 0);
220		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
221			return (1);
222	}
223
224	if (h->pgno == c->pg.pgno)
225		goto ret;
226
227	/* Reacquire the original stack. */
228	mpool_put(t->bt_mp, h, 0);
229	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
230		return (1);
231	h = e->page;
232
233	/*
234	 * Move left, looking for the page.  At each move we have to move
235	 * up the stack until we don't have to change pages to move to the
236	 * next page.  If we have to change pages at an internal level, we
237	 * have to fix the stack back up.
238	 */
239	while (h->pgno != c->pg.pgno) {
240		if ((prevpg = h->prevpg) == P_INVALID)
241			break;
242		mpool_put(t->bt_mp, h, 0);
243
244		/* Move up the stack. */
245		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
246			/* Get the parent page. */
247			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
248				return (1);
249
250			/* Move to the next index. */
251			if (parent->index != 0) {
252				index = parent->index - 1;
253				BT_PUSH(t, h->pgno, index);
254				break;
255			}
256			mpool_put(t->bt_mp, h, 0);
257		}
258
259		/* Restore the stack. */
260		while (level--) {
261			/* Push the next level down onto the stack. */
262			bi = GETBINTERNAL(h, index);
263			pgno = bi->pgno;
264
265			/* Lose the currently pinned page. */
266			mpool_put(t->bt_mp, h, 0);
267
268			/* Get the next level down. */
269			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
270				return (1);
271
272			index = NEXTINDEX(h) - 1;
273			BT_PUSH(t, pgno, index);
274		}
275		mpool_put(t->bt_mp, h, 0);
276		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
277			return (1);
278	}
279
280
281ret:	mpool_put(t->bt_mp, h, 0);
282	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
283}
284
285/*
286 * __bt_bdelete --
287 *	Delete all key/data pairs matching the specified key.
288 *
289 * Parameters:
290 *	  t:	tree
291 *	key:	key to delete
292 *
293 * Returns:
294 *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
295 */
296static int
297__bt_bdelete(t, key)
298	BTREE *t;
299	const DBT *key;
300{
301	EPG *e;
302	PAGE *h;
303	int deleted, exact, redo;
304
305	deleted = 0;
306
307	/* Find any matching record; __bt_search pins the page. */
308loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
309		return (deleted ? RET_SUCCESS : RET_ERROR);
310	if (!exact) {
311		mpool_put(t->bt_mp, e->page, 0);
312		return (deleted ? RET_SUCCESS : RET_SPECIAL);
313	}
314
315	/*
316	 * Delete forward, then delete backward, from the found key.  If
317	 * there are duplicates and we reach either side of the page, do
318	 * the key search again, so that we get them all.
319	 */
320	redo = 0;
321	h = e->page;
322	do {
323		if (__bt_dleaf(t, key, h, e->index)) {
324			mpool_put(t->bt_mp, h, 0);
325			return (RET_ERROR);
326		}
327		if (F_ISSET(t, B_NODUPS)) {
328			if (NEXTINDEX(h) == 0) {
329				if (__bt_pdelete(t, h))
330					return (RET_ERROR);
331			} else
332				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
333			return (RET_SUCCESS);
334		}
335		deleted = 1;
336	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
337
338	/* Check for right-hand edge of the page. */
339	if (e->index == NEXTINDEX(h))
340		redo = 1;
341
342	/* Delete from the key to the beginning of the page. */
343	while (e->index-- > 0) {
344		if (__bt_cmp(t, key, e) != 0)
345			break;
346		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
347			mpool_put(t->bt_mp, h, 0);
348			return (RET_ERROR);
349		}
350		if (e->index == 0)
351			redo = 1;
352	}
353
354	/* Check for an empty page. */
355	if (NEXTINDEX(h) == 0) {
356		if (__bt_pdelete(t, h))
357			return (RET_ERROR);
358		goto loop;
359	}
360
361	/* Put the page. */
362	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
363
364	if (redo)
365		goto loop;
366	return (RET_SUCCESS);
367}
368
369/*
370 * __bt_pdelete --
371 *	Delete a single page from the tree.
372 *
373 * Parameters:
374 *	t:	tree
375 *	h:	leaf page
376 *
377 * Returns:
378 *	RET_SUCCESS, RET_ERROR.
379 *
380 * Side-effects:
381 *	mpool_put's the page
382 */
383static int
384__bt_pdelete(t, h)
385	BTREE *t;
386	PAGE *h;
387{
388	BINTERNAL *bi;
389	PAGE *pg;
390	EPGNO *parent;
391	indx_t cnt, index, *ip, offset;
392	u_int32_t nksize;
393	char *from;
394
395	/*
396	 * Walk the parent page stack -- a LIFO stack of the pages that were
397	 * traversed when we searched for the page where the delete occurred.
398	 * Each stack entry is a page number and a page index offset.  The
399	 * offset is for the page traversed on the search.  We've just deleted
400	 * a page, so we have to delete the key from the parent page.
401	 *
402	 * If the delete from the parent page makes it empty, this process may
403	 * continue all the way up the tree.  We stop if we reach the root page
404	 * (which is never deleted, it's just not worth the effort) or if the
405	 * delete does not empty the page.
406	 */
407	while ((parent = BT_POP(t)) != NULL) {
408		/* Get the parent page. */
409		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
410			return (RET_ERROR);
411
412		index = parent->index;
413		bi = GETBINTERNAL(pg, index);
414
415		/* Free any overflow pages. */
416		if (bi->flags & P_BIGKEY &&
417		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
418			mpool_put(t->bt_mp, pg, 0);
419			return (RET_ERROR);
420		}
421
422		/*
423		 * Free the parent if it has only the one key and it's not the
424		 * root page. If it's the rootpage, turn it back into an empty
425		 * leaf page.
426		 */
427		if (NEXTINDEX(pg) == 1)
428			if (pg->pgno == P_ROOT) {
429				pg->lower = BTDATAOFF;
430				pg->upper = t->bt_psize;
431				pg->flags = P_BLEAF;
432			} else {
433				if (__bt_relink(t, pg) || __bt_free(t, pg))
434					return (RET_ERROR);
435				continue;
436			}
437		else {
438			/* Pack remaining key items at the end of the page. */
439			nksize = NBINTERNAL(bi->ksize);
440			from = (char *)pg + pg->upper;
441			memmove(from + nksize, from, (char *)bi - from);
442			pg->upper += nksize;
443
444			/* Adjust indices' offsets, shift the indices down. */
445			offset = pg->linp[index];
446			for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip)
447				if (ip[0] < offset)
448					ip[0] += nksize;
449			for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip)
450				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
451			pg->lower -= sizeof(indx_t);
452		}
453
454		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
455		break;
456	}
457
458	/* Free the leaf page, as long as it wasn't the root. */
459	if (h->pgno == P_ROOT) {
460		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
461		return (RET_SUCCESS);
462	}
463	return (__bt_relink(t, h) || __bt_free(t, h));
464}
465
466/*
467 * __bt_dleaf --
468 *	Delete a single record from a leaf page.
469 *
470 * Parameters:
471 *	t:	tree
472 *    key:	referenced key
473 *	h:	page
474 *	index:	index on page to delete
475 *
476 * Returns:
477 *	RET_SUCCESS, RET_ERROR.
478 */
479int
480__bt_dleaf(t, key, h, index)
481	BTREE *t;
482	const DBT *key;
483	PAGE *h;
484	u_int index;
485{
486	BLEAF *bl;
487	indx_t cnt, *ip, offset;
488	u_int32_t nbytes;
489	void *to;
490	char *from;
491
492	/* If this record is referenced by the cursor, delete the cursor. */
493	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
494	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
495	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index &&
496	    __bt_curdel(t, key, h, index))
497		return (RET_ERROR);
498
499	/* If the entry uses overflow pages, make them available for reuse. */
500	to = bl = GETBLEAF(h, index);
501	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
502		return (RET_ERROR);
503	if (bl->flags & P_BIGDATA &&
504	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
505		return (RET_ERROR);
506
507	/* Pack the remaining key/data items at the end of the page. */
508	nbytes = NBLEAF(bl);
509	from = (char *)h + h->upper;
510	memmove(from + nbytes, from, (char *)to - from);
511	h->upper += nbytes;
512
513	/* Adjust the indices' offsets, shift the indices down. */
514	offset = h->linp[index];
515	for (cnt = index, ip = &h->linp[0]; cnt--; ++ip)
516		if (ip[0] < offset)
517			ip[0] += nbytes;
518	for (cnt = NEXTINDEX(h) - index; --cnt; ++ip)
519		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
520	h->lower -= sizeof(indx_t);
521
522	/* If the cursor is on this page, adjust it as necessary. */
523	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
524	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
525	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index)
526		--t->bt_cursor.pg.index;
527
528	return (RET_SUCCESS);
529}
530
531/*
532 * __bt_curdel --
533 *	Delete the cursor.
534 *
535 * Parameters:
536 *	t:	tree
537 *    key:	referenced key (or NULL)
538 *	h:	page
539 *  index:	index on page to delete
540 *
541 * Returns:
542 *	RET_SUCCESS, RET_ERROR.
543 */
544static int
545__bt_curdel(t, key, h, index)
546	BTREE *t;
547	const DBT *key;
548	PAGE *h;
549	u_int index;
550{
551	CURSOR *c;
552	EPG e;
553	PAGE *pg;
554	int curcopy, status;
555
556	/*
557	 * If there are duplicates, move forward or backward to one.
558	 * Otherwise, copy the key into the cursor area.
559	 */
560	c = &t->bt_cursor;
561	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
562
563	curcopy = 0;
564	if (!F_ISSET(t, B_NODUPS)) {
565		/*
566		 * We're going to have to do comparisons.  If we weren't
567		 * provided a copy of the key, i.e. the user is deleting
568		 * the current cursor position, get one.
569		 */
570		if (key == NULL) {
571			e.page = h;
572			e.index = index;
573			if ((status = __bt_ret(t, &e,
574			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
575				return (status);
576			curcopy = 1;
577			key = &c->key;
578		}
579		/* Check previous key, if not at the beginning of the page. */
580		if (index > 0) {
581			e.page = h;
582			e.index = index - 1;
583			if (__bt_cmp(t, key, &e) == 0) {
584				F_SET(c, CURS_BEFORE);
585				goto dup2;
586			}
587		}
588		/* Check next key, if not at the end of the page. */
589		if (index < NEXTINDEX(h) - 1) {
590			e.page = h;
591			e.index = index + 1;
592			if (__bt_cmp(t, key, &e) == 0) {
593				F_SET(c, CURS_AFTER);
594				goto dup2;
595			}
596		}
597		/* Check previous key if at the beginning of the page. */
598		if (index == 0 && h->prevpg != P_INVALID) {
599			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
600				return (RET_ERROR);
601			e.page = pg;
602			e.index = NEXTINDEX(pg) - 1;
603			if (__bt_cmp(t, key, &e) == 0) {
604				F_SET(c, CURS_BEFORE);
605				goto dup1;
606			}
607			mpool_put(t->bt_mp, pg, 0);
608		}
609		/* Check next key if at the end of the page. */
610		if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
611			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
612				return (RET_ERROR);
613			e.page = pg;
614			e.index = 0;
615			if (__bt_cmp(t, key, &e) == 0) {
616				F_SET(c, CURS_AFTER);
617dup1:				mpool_put(t->bt_mp, pg, 0);
618dup2:				c->pg.pgno = e.page->pgno;
619				c->pg.index = e.index;
620				return (RET_SUCCESS);
621			}
622			mpool_put(t->bt_mp, pg, 0);
623		}
624	}
625	e.page = h;
626	e.index = index;
627	if (curcopy || (status =
628	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
629		F_SET(c, CURS_ACQUIRE);
630		return (RET_SUCCESS);
631	}
632	return (status);
633}
634
635/*
636 * __bt_relink --
637 *	Link around a deleted page.
638 *
639 * Parameters:
640 *	t:	tree
641 *	h:	page to be deleted
642 */
643static int
644__bt_relink(t, h)
645	BTREE *t;
646	PAGE *h;
647{
648	PAGE *pg;
649
650	if (h->nextpg != P_INVALID) {
651		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
652			return (RET_ERROR);
653		pg->prevpg = h->prevpg;
654		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
655	}
656	if (h->prevpg != P_INVALID) {
657		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
658			return (RET_ERROR);
659		pg->nextpg = h->nextpg;
660		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
661	}
662	return (0);
663}
664