1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993, 1994, 1995
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Mike Olson.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id: bt_delete.c,v 12.30 2008/01/28 21:40:57 ubell Exp $
42 */
43
44#include "db_config.h"
45
46#include "db_int.h"
47#include "dbinc/db_page.h"
48#include "dbinc/btree.h"
49#include "dbinc/lock.h"
50#include "dbinc/mp.h"
51
52/*
53 * __bam_ditem --
54 *	Delete one or more entries from a page.
55 *
56 * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
57 */
58int
59__bam_ditem(dbc, h, indx)
60	DBC *dbc;
61	PAGE *h;
62	u_int32_t indx;
63{
64	BINTERNAL *bi;
65	BKEYDATA *bk;
66	DB *dbp;
67	u_int32_t nbytes;
68	int ret;
69	db_indx_t *inp;
70
71	dbp = dbc->dbp;
72	inp = P_INP(dbp, h);
73
74	/* The page should already have been dirtied by our caller. */
75	DB_ASSERT(dbp->env, IS_DIRTY(h));
76
77	switch (TYPE(h)) {
78	case P_IBTREE:
79		bi = GET_BINTERNAL(dbp, h, indx);
80		switch (B_TYPE(bi->type)) {
81		case B_DUPLICATE:
82		case B_KEYDATA:
83			nbytes = BINTERNAL_SIZE(bi->len);
84			break;
85		case B_OVERFLOW:
86			nbytes = BINTERNAL_SIZE(bi->len);
87			if ((ret =
88			    __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
89				return (ret);
90			break;
91		default:
92			return (__db_pgfmt(dbp->env, PGNO(h)));
93		}
94		break;
95	case P_IRECNO:
96		nbytes = RINTERNAL_SIZE;
97		break;
98	case P_LBTREE:
99		/*
100		 * If it's a duplicate key, discard the index and don't touch
101		 * the actual page item.
102		 *
103		 * !!!
104		 * This works because no data item can have an index matching
105		 * any other index so even if the data item is in a key "slot",
106		 * it won't match any other index.
107		 */
108		if ((indx % 2) == 0) {
109			/*
110			 * Check for a duplicate after us on the page.  NOTE:
111			 * we have to delete the key item before deleting the
112			 * data item, otherwise the "indx + P_INDX" calculation
113			 * won't work!
114			 */
115			if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
116			    inp[indx] == inp[indx + P_INDX])
117				return (__bam_adjindx(dbc,
118				    h, indx, indx + O_INDX, 0));
119			/*
120			 * Check for a duplicate before us on the page.  It
121			 * doesn't matter if we delete the key item before or
122			 * after the data item for the purposes of this one.
123			 */
124			if (indx > 0 && inp[indx] == inp[indx - P_INDX])
125				return (__bam_adjindx(dbc,
126				    h, indx, indx - P_INDX, 0));
127		}
128		/* FALLTHROUGH */
129	case P_LDUP:
130	case P_LRECNO:
131		bk = GET_BKEYDATA(dbp, h, indx);
132		switch (B_TYPE(bk->type)) {
133		case B_DUPLICATE:
134			nbytes = BOVERFLOW_SIZE;
135			break;
136		case B_OVERFLOW:
137			nbytes = BOVERFLOW_SIZE;
138			if ((ret = __db_doff(
139			    dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
140				return (ret);
141			break;
142		case B_KEYDATA:
143			nbytes = BKEYDATA_SIZE(bk->len);
144			break;
145		default:
146			return (__db_pgfmt(dbp->env, PGNO(h)));
147		}
148		break;
149	default:
150		return (__db_pgfmt(dbp->env, PGNO(h)));
151	}
152
153	/* Delete the item and mark the page dirty. */
154	if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
155		return (ret);
156
157	return (0);
158}
159
160/*
161 * __bam_adjindx --
162 *	Adjust an index on the page.
163 *
164 * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
165 */
166int
167__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
168	DBC *dbc;
169	PAGE *h;
170	u_int32_t indx, indx_copy;
171	int is_insert;
172{
173	DB *dbp;
174	db_indx_t copy, *inp;
175	int ret;
176
177	dbp = dbc->dbp;
178	inp = P_INP(dbp, h);
179
180	/* Log the change. */
181	if (DBC_LOGGING(dbc)) {
182	    if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
183		PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
184			return (ret);
185	} else
186		LSN_NOT_LOGGED(LSN(h));
187
188	/* Shuffle the indices and mark the page dirty. */
189	if (is_insert) {
190		copy = inp[indx_copy];
191		if (indx != NUM_ENT(h))
192			memmove(&inp[indx + O_INDX], &inp[indx],
193			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
194		inp[indx] = copy;
195		++NUM_ENT(h);
196	} else {
197		--NUM_ENT(h);
198		if (indx != NUM_ENT(h))
199			memmove(&inp[indx], &inp[indx + O_INDX],
200			    sizeof(db_indx_t) * (NUM_ENT(h) - indx));
201	}
202
203	return (0);
204}
205
206/*
207 * __bam_dpages --
208 *	Delete a set of locked pages.
209 *
210 * PUBLIC: int __bam_dpages __P((DBC *, int, int));
211 */
212int
213__bam_dpages(dbc, use_top, update)
214	DBC *dbc;
215	int use_top;
216	int update;
217{
218	BINTERNAL *bi;
219	BTREE_CURSOR *cp;
220	DB *dbp;
221	DBT a, b;
222	DB_LOCK c_lock, p_lock;
223	DB_MPOOLFILE *mpf;
224	EPG *epg, *save_sp, *stack_epg;
225	PAGE *child, *parent;
226	db_indx_t nitems;
227	db_pgno_t pgno, root_pgno;
228	db_recno_t rcnt;
229	int done, ret, t_ret;
230
231	dbp = dbc->dbp;
232	mpf = dbp->mpf;
233	cp = (BTREE_CURSOR *)dbc->internal;
234	nitems = 0;
235	pgno = PGNO_INVALID;
236
237	/*
238	 * We have the entire stack of deletable pages locked.
239	 *
240	 * Btree calls us with the first page in the stack is to have a
241	 * single item deleted, and the rest of the pages are to be removed.
242	 *
243	 * Recno always has a stack to the root and __bam_merge operations
244	 * may have unneeded items in the sack.  We find the lowest page
245	 * in the stack that has more than one record in it and start there.
246	 */
247	ret = 0;
248	if (use_top)
249		stack_epg = cp->sp;
250	else
251		for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg)
252			if (NUM_ENT(stack_epg->page) > 1)
253				break;
254	epg = stack_epg;
255	/*
256	 * !!!
257	 * There is an interesting deadlock situation here.  We have to relink
258	 * the leaf page chain around the leaf page being deleted.  Consider
259	 * a cursor walking through the leaf pages, that has the previous page
260	 * read-locked and is waiting on a lock for the page we're deleting.
261	 * It will deadlock here.  Before we unlink the subtree, we relink the
262	 * leaf page chain.
263	 */
264	if (LEVEL(cp->csp->page) == 1 &&
265	    (ret = __bam_relink(dbc, cp->csp->page, PGNO_INVALID)) != 0)
266		goto discard;
267
268	/*
269	 * Delete the last item that references the underlying pages that are
270	 * to be deleted, and adjust cursors that reference that page.  Then,
271	 * save that page's page number and item count and release it.  If
272	 * the application isn't retaining locks because it's running without
273	 * transactions, this lets the rest of the tree get back to business
274	 * immediately.
275	 */
276	if ((ret = __memp_dirty(mpf,
277	    &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
278		goto discard;
279	if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
280		goto discard;
281	if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
282		goto discard;
283
284	if (update && epg->indx == 0) {
285		save_sp = cp->csp;
286		cp->csp = epg;
287		ret = __bam_pupdate(dbc, epg->page);
288		cp->csp = save_sp;
289		if (ret != 0)
290			goto discard;
291	}
292
293	pgno = PGNO(epg->page);
294	nitems = NUM_ENT(epg->page);
295
296	ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority);
297	epg->page = NULL;
298	if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
299		ret = t_ret;
300	if (ret != 0)
301		goto err_inc;
302
303	/* Then, discard any pages that we don't care about. */
304discard: for (epg = cp->sp; epg < stack_epg; ++epg) {
305		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
306		     epg->page, dbc->priority)) != 0 && ret == 0)
307			ret = t_ret;
308		epg->page = NULL;
309		if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
310			ret = t_ret;
311	}
312	if (ret != 0)
313		goto err;
314
315	/* Free the rest of the pages in the stack. */
316	while (++epg <= cp->csp) {
317		if ((ret = __memp_dirty(mpf, &epg->page,
318		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
319			goto err;
320		/*
321		 * Delete page entries so they will be restored as part of
322		 * recovery.  We don't need to do cursor adjustment here as
323		 * the pages are being emptied by definition and so cannot
324		 * be referenced by a cursor.
325		 */
326		if (NUM_ENT(epg->page) != 0) {
327			DB_ASSERT(dbp->env, LEVEL(epg->page) != 1);
328
329			if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
330				goto err;
331			/*
332			 * Sheer paranoia: if we find any pages that aren't
333			 * emptied by the delete, someone else added an item
334			 * while we were walking the tree, and we discontinue
335			 * the delete.  Shouldn't be possible, but we check
336			 * regardless.
337			 */
338			if (NUM_ENT(epg->page) != 0)
339				goto err;
340		}
341
342		ret = __db_free(dbc, epg->page);
343		if (cp->page == epg->page)
344			cp->page = NULL;
345		epg->page = NULL;
346		if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
347			ret = t_ret;
348		if (ret != 0)
349			goto err_inc;
350	}
351
352	if (0) {
353err_inc:	++epg;
354err:		for (; epg <= cp->csp; ++epg) {
355			if (epg->page != NULL) {
356				(void)__memp_fput(mpf, dbc->thread_info,
357				     epg->page, dbc->priority);
358				epg->page = NULL;
359			}
360			(void)__TLPUT(dbc, epg->lock);
361		}
362		BT_STK_CLR(cp);
363		return (ret);
364	}
365	BT_STK_CLR(cp);
366
367	/*
368	 * If we just deleted the next-to-last item from the root page, the
369	 * tree can collapse one or more levels.  While there remains only a
370	 * single item on the root page, write lock the last page referenced
371	 * by the root page and copy it over the root page.
372	 */
373	root_pgno = cp->root;
374	if (pgno != root_pgno || nitems != 1)
375		return (0);
376
377	for (done = 0; !done;) {
378		/* Initialize. */
379		parent = child = NULL;
380		LOCK_INIT(p_lock);
381		LOCK_INIT(c_lock);
382
383		/* Lock the root. */
384		pgno = root_pgno;
385		if ((ret =
386		    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
387			goto stop;
388		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
389		    DB_MPOOL_DIRTY, &parent)) != 0)
390			goto stop;
391
392		if (NUM_ENT(parent) != 1)
393			goto stop;
394
395		switch (TYPE(parent)) {
396		case P_IBTREE:
397			/*
398			 * If this is overflow, then try to delete it.
399			 * The child may or may not still point at it.
400			 */
401			bi = GET_BINTERNAL(dbp, parent, 0);
402			if (B_TYPE(bi->type) == B_OVERFLOW)
403				if ((ret = __db_doff(dbc,
404				    ((BOVERFLOW *)bi->data)->pgno)) != 0)
405					goto stop;
406			pgno = bi->pgno;
407			break;
408		case P_IRECNO:
409			pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
410			break;
411		default:
412			goto stop;
413		}
414
415		/* Lock the child page. */
416		if ((ret =
417		    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
418			goto stop;
419		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
420		    DB_MPOOL_DIRTY, &child)) != 0)
421			goto stop;
422
423		/* Log the change. */
424		if (DBC_LOGGING(dbc)) {
425			memset(&a, 0, sizeof(a));
426			a.data = child;
427			a.size = dbp->pgsize;
428			memset(&b, 0, sizeof(b));
429			b.data = P_ENTRY(dbp, parent, 0);
430			b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
431			    BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
432			if ((ret = __bam_rsplit_log(dbp, dbc->txn,
433			    &child->lsn, 0, PGNO(child), &a, PGNO(parent),
434			    RE_NREC(parent), &b, &parent->lsn)) != 0)
435				goto stop;
436		} else
437			LSN_NOT_LOGGED(child->lsn);
438
439		/*
440		 * Make the switch.
441		 *
442		 * One fixup -- internal pages below the top level do not store
443		 * a record count, so we have to preserve it if we're not
444		 * converting to a leaf page.  Note also that we are about to
445		 * overwrite the parent page, including its LSN.  This is OK
446		 * because the log message we wrote describing this update
447		 * stores its LSN on the child page.  When the child is copied
448		 * onto the parent, the correct LSN is copied into place.
449		 */
450		COMPQUIET(rcnt, 0);
451		if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
452			rcnt = RE_NREC(parent);
453		memcpy(parent, child, dbp->pgsize);
454		PGNO(parent) = root_pgno;
455		if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
456			RE_NREC_SET(parent, rcnt);
457
458		/* Adjust the cursors. */
459		if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
460			goto stop;
461
462		/*
463		 * Free the page copied onto the root page and discard its
464		 * lock.  (The call to __db_free() discards our reference
465		 * to the page.)
466		 */
467		if ((ret = __db_free(dbc, child)) != 0) {
468			child = NULL;
469			goto stop;
470		}
471		child = NULL;
472
473		if (0) {
474stop:			done = 1;
475		}
476		if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0)
477			ret = t_ret;
478		if (parent != NULL &&
479		    (t_ret = __memp_fput(mpf, dbc->thread_info,
480		    parent, dbc->priority)) != 0 && ret == 0)
481			ret = t_ret;
482		if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0)
483			ret = t_ret;
484		if (child != NULL &&
485		    (t_ret = __memp_fput(mpf, dbc->thread_info,
486		    child, dbc->priority)) != 0 && ret == 0)
487			ret = t_ret;
488	}
489
490	return (ret);
491}
492
493/*
494 * __bam_relink --
495 *	Relink around a deleted page.
496 *
497 * PUBLIC: int __bam_relink __P((DBC *, PAGE *, db_pgno_t));
498 */
499int
500__bam_relink(dbc, pagep, new_pgno)
501	DBC *dbc;
502	PAGE *pagep;
503	db_pgno_t new_pgno;
504{
505	DB *dbp;
506	DB_LOCK npl, ppl;
507	DB_LSN *nlsnp, *plsnp, ret_lsn;
508	DB_MPOOLFILE *mpf;
509	PAGE *np, *pp;
510	int ret, t_ret;
511
512	dbp = dbc->dbp;
513	np = pp = NULL;
514	LOCK_INIT(npl);
515	LOCK_INIT(ppl);
516	nlsnp = plsnp = NULL;
517	mpf = dbp->mpf;
518	ret = 0;
519
520	/*
521	 * Retrieve and lock the one/two pages.  For a remove, we may need
522	 * two pages (the before and after).  For an add, we only need one
523	 * because, the split took care of the prev.
524	 */
525	if (pagep->next_pgno != PGNO_INVALID) {
526		if ((ret = __db_lget(dbc,
527		    0, pagep->next_pgno, DB_LOCK_WRITE, 0, &npl)) != 0)
528			goto err;
529		if ((ret = __memp_fget(mpf, &pagep->next_pgno,
530		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
531			ret = __db_pgerr(dbp, pagep->next_pgno, ret);
532			goto err;
533		}
534		nlsnp = &np->lsn;
535	}
536	if (pagep->prev_pgno != PGNO_INVALID) {
537		if ((ret = __db_lget(dbc,
538		    0, pagep->prev_pgno, DB_LOCK_WRITE, 0, &ppl)) != 0)
539			goto err;
540		if ((ret = __memp_fget(mpf, &pagep->prev_pgno,
541		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
542			ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
543			goto err;
544		}
545		plsnp = &pp->lsn;
546	}
547
548	/* Log the change. */
549	if (DBC_LOGGING(dbc)) {
550		if ((ret = __bam_relink_log(dbp, dbc->txn, &ret_lsn, 0,
551		    pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
552		    pagep->next_pgno, nlsnp)) != 0)
553			goto err;
554	} else
555		LSN_NOT_LOGGED(ret_lsn);
556	if (np != NULL)
557		np->lsn = ret_lsn;
558	if (pp != NULL)
559		pp->lsn = ret_lsn;
560
561	/*
562	 * Modify and release the two pages.
563	 */
564	if (np != NULL) {
565		if (new_pgno == PGNO_INVALID)
566			np->prev_pgno = pagep->prev_pgno;
567		else
568			np->prev_pgno = new_pgno;
569		ret = __memp_fput(mpf, dbc->thread_info, np, dbc->priority);
570		if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
571			ret = t_ret;
572		if (ret != 0)
573			goto err;
574	}
575
576	if (pp != NULL) {
577		if (new_pgno == PGNO_INVALID)
578			pp->next_pgno = pagep->next_pgno;
579		else
580			pp->next_pgno = new_pgno;
581		ret = __memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
582		if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
583			ret = t_ret;
584		if (ret != 0)
585			goto err;
586	}
587	return (0);
588
589err:	if (np != NULL)
590		(void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
591	(void)__TLPUT(dbc, npl);
592	if (pp != NULL)
593		(void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
594	(void)__TLPUT(dbc, ppl);
595	return (ret);
596}
597
598/*
599 * __bam_pupdate --
600 *	Update parent key pointers up the tree.
601 *
602 * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
603 */
604int
605__bam_pupdate(dbc, lpg)
606	DBC *dbc;
607	PAGE *lpg;
608{
609	BTREE_CURSOR *cp;
610	ENV *env;
611	EPG *epg;
612	int ret;
613
614	env = dbc->env;
615	cp = (BTREE_CURSOR *)dbc->internal;
616	ret = 0;
617
618	/*
619	 * Update the parents up the tree.  __bam_pinsert only looks at the
620	 * left child if is a leaf page, so we don't need to change it.  We
621	 * just do a delete and insert; a replace is possible but reusing
622	 * pinsert is better.
623	 */
624	for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) {
625		if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page,
626		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
627			return (ret);
628		if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
629			return (ret);
630		epg->indx--;
631		if ((ret = __bam_pinsert(dbc, epg, 0,
632		    lpg, epg[1].page, BPI_NORECNUM)) != 0) {
633			if (ret == DB_NEEDSPLIT) {
634				/* This should not happen. */
635				__db_errx(env,
636				     "Not enough room in parent: %s: page %lu",
637				     dbc->dbp->fname, (u_long)PGNO(epg->page));
638				ret = __env_panic(env, EINVAL);
639			}
640			return (ret);
641		}
642	}
643	return (ret);
644}
645