/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995, 1996
 *	Keith Bostic.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Olson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Id: db_meta.c,v 12.58 2008/05/07 12:27:32 bschmeck Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
#include "dbinc/db_am.h"

static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
#ifdef HAVE_FTRUNCATE
static void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
static int  __db_pglistcmp __P((const void *, const void *));
static int  __db_truncate_freelist __P((DBC *, DBMETA *,
      PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
#endif

/*
 * __db_init_meta --
 *	Helper function for __db_new that initializes the important fields in
 * a meta-data page (used instead of P_INIT).  We need to make sure that we
 * retain the page number and LSN of the existing page.
 */
static void
__db_init_meta(dbp, p, pgno, pgtype)
	DB *dbp;
	void *p;
	db_pgno_t pgno;
	u_int32_t pgtype;
{
	DBMETA *meta;
	DB_LSN save_lsn;

	meta = (DBMETA *)p;
	save_lsn = meta->lsn;
	memset(meta, 0, sizeof(DBMETA));
	meta->lsn = save_lsn;
	meta->pagesize = dbp->pgsize;
	if (F_ISSET(dbp, DB_AM_CHKSUM))
		FLD_SET(meta->metaflags, DBMETA_CHKSUM);
	meta->pgno = pgno;
	meta->type = (u_int8_t)pgtype;
}

/*
 * __db_new --
 *	Get a new page, preferably from the freelist.
 *
 * PUBLIC: int __db_new __P((DBC *, u_int32_t, PAGE **));
 */
int
__db_new(dbc, type, pagepp)
	DBC *dbc;
	u_int32_t type;
	PAGE **pagepp;
{
	DB *dbp;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t last, *list, pgno, newnext;
	int extend, ret, t_ret;

	meta = NULL;
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	newnext = PGNO_INVALID;

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
	    DB_MPOOL_DIRTY, &meta)) != 0)
		goto err;
	last = meta->last_pgno;
	if (meta->free == PGNO_INVALID) {
		if (FLD_ISSET(type, P_DONTEXTEND)) {
			*pagepp = NULL;
			goto err;
		}
		last = pgno = meta->last_pgno + 1;
		ZERO_LSN(lsn);
		extend = 1;
	} else {
		pgno = meta->free;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		/*
		 * We want to take the first page off the free list and
		 * then set meta->free to that page's next_pgno, but we
		 * need to log the change first.
		 */
		newnext = h->next_pgno;
		lsn = h->lsn;
		extend = 0;
		DB_ASSERT(env, TYPE(h) == P_INVALID);

		if (TYPE(h) != P_INVALID) {
			__db_errx(env,
			    "%s page %lu is on free list with type %lu",
			    dbp->fname, (u_long)PGNO(h), (u_long)TYPE(h));
			return (__env_panic(env, EINVAL));
		}

	}

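	/*
	 * P_DONTEXTEND is only a request flag OR'd into the type argument;
	 * clear it now so the remaining code sees the real page type.
	 */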
	FLD_CLR(type, P_DONTEXTEND);

	/*
	 * Log the allocation before fetching the new page.  If we
	 * don't have room in the log then we don't want to tell
	 * mpool to extend the file.
	 */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
		    &LSN(meta), PGNO_BASE_MD, &lsn,
		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	meta->free = newnext;

	if (extend == 1) {
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_NEW, &h)) != 0)
			goto err;
		DB_ASSERT(env, last == pgno);
		meta->last_pgno = pgno;
		ZERO_LSN(h->lsn);
		h->pgno = pgno;
	}
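	/*
	 * Whether the page was taken from the free list or newly created,
	 * stamp it with the LSN of the allocation (now stored in the meta
	 * page) so recovery can tell whether the allocation is reflected
	 * on the page.
	 */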
	LSN(h) = LSN(meta);

	ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	meta = NULL;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (ret != 0)
		goto err;

	switch (type) {
		case P_BTREEMETA:
		case P_HASHMETA:
		case P_QAMMETA:
			__db_init_meta(dbp, h, h->pgno, type);
			break;
		default:
			P_INIT(h, dbp->pgsize,
			    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
			break;
	}

	/* Fix up the sorted free list if necessary. */
#ifdef HAVE_FTRUNCATE
	if (extend == 0) {
		u_int32_t nelems = 0;

		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
			goto err;
		if (nelems != 0) {
			DB_ASSERT(env, h->pgno == list[0]);
			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
			if ((ret = __memp_extend_freelist(
			    dbp->mpf, nelems - 1, &list)) != 0)
				goto err;
		}
	}
#else
	COMPQUIET(list, NULL);
#endif

	/*
	 * If dirty reads are enabled and we are in a transaction, we could
	 * abort this allocation after the page(s) pointing to this
	 * one have their locks downgraded.  This would permit dirty readers
	 * to access this page which is ok, but they must be off the
	 * page when we abort.  We never lock overflow pages or off page
	 * duplicate trees.
	 */
	if (type != P_OVERFLOW && !F_ISSET(dbc, DBC_OPD) &&
	    F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) && dbc->txn != NULL) {
		if ((ret = __db_lget(dbc, 0,
		    h->pgno, DB_LOCK_WWRITE, 0, &metalock)) != 0)
			goto err;
	}

	*pagepp = h;
	return (0);

err:	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (meta != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	(void)__TLPUT(dbc, metalock);
	return (ret);
}

/*
 * __db_free --
 *	Add a page to the head of the freelist.
 *
 * PUBLIC: int __db_free __P((DBC *, PAGE *));
 */
int
__db_free(dbc, h)
	DBC *dbc;
	PAGE *h;
{
	DB *dbp;
	DBMETA *meta;
	DBT ddbt, ldbt;
	DB_LOCK metalock;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	PAGE *prev;
	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
	u_int32_t lflag;
	int ret, t_ret;
#ifdef HAVE_FTRUNCATE
	db_pgno_t *list, *lp;
	u_int32_t nelem, position, start;
	int do_truncate;
#endif

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	prev_pgno = PGNO_INVALID;
	meta = NULL;
	prev = NULL;
#ifdef HAVE_FTRUNCATE
	lp = NULL;
	nelem = 0;
	do_truncate = 0;
#endif

	/*
	 * Retrieve the metadata page.  If we are not keeping a sorted
	 * free list, put the page at the head of the free list.  If we
	 * are keeping a sorted free list (for truncation), figure out
	 * where this page belongs and either link it in or truncate the
	 * file as much as possible.  If either the lock get or the page
	 * get fails, we still need to put the page we were called with,
	 * because our caller assumes we take care of it.
	 */
	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;

	/* If we support truncate, we might not dirty the meta page. */
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
#ifdef HAVE_FTRUNCATE
	    0,
#else
	    DB_MPOOL_DIRTY,
#endif
	    &meta)) != 0)
		goto err1;

	last_pgno = meta->last_pgno;
	next_pgno = meta->free;
	/*
	 * Assign lsnp here so it is always initialized when
	 * HAVE_FTRUNCATE is not defined.
	 */
	lsnp = &LSN(meta);

	DB_ASSERT(dbp->env, h->pgno != next_pgno);

#ifdef HAVE_FTRUNCATE
	/*
	 * If we are maintaining a sorted free list, see if we either have a
	 * new truncation point or the page goes somewhere in the middle of
	 * the list.  If it goes in the middle of the list, we will drop the
	 * meta page and get the previous page.
	 */
	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
		goto err1;
	if (list == NULL)
		goto no_sort;

	if (h->pgno != last_pgno) {
		/*
		 * Put the page number in the sorted list: find its
		 * position and the previous page, extend the list,
		 * make room, and insert.
		 */
		position = 0;
		if (nelem != 0) {
			__db_freelist_pos(h->pgno, list, nelem, &position);

			DB_ASSERT(dbp->env, h->pgno != list[position]);

			/* Get the previous page if this is not the smallest. */
			if (position != 0 || h->pgno > list[0])
				prev_pgno = list[position];
		}

		/* Put the page number into the list. */
		if ((ret = __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
			goto err1;
		if (prev_pgno != PGNO_INVALID)
			lp = &list[position + 1];
		else
			lp = list;
		if (nelem != 0 && position != nelem)
			memmove(lp + 1, lp,
			    (size_t)((u_int8_t *)&list[nelem] - (u_int8_t *)lp));
		*lp = h->pgno;
	} else if (nelem != 0) {
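		/*
		 * The page being freed is the current last page of the
		 * file.  Walk backward from the end of the sorted free
		 * list to see whether its tail runs contiguously up to
		 * this page; if so, more than one page can be returned
		 * to the OS below.
		 */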
		/* Find the truncation point. */
		for (lp = &list[nelem - 1]; lp >= list; lp--)
			if (--last_pgno != *lp)
				break;
		if (lp < list || last_pgno < h->pgno - 1)
			do_truncate = 1;
		last_pgno = meta->last_pgno;
	}

no_sort:
	if (prev_pgno == PGNO_INVALID) {
		if ((ret = __memp_dirty(mpf,
		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		lsnp = &LSN(meta);
	} else {
		pgno = prev_pgno;
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
			goto err1;
		next_pgno = NEXT_PGNO(prev);
		lsnp = &LSN(prev);
	}
#endif

	/*
	 * Log the change.
	 *	We are either logging an update to the metapage or to the
	 * previous page in the sorted list.
	 */
	if (DBC_LOGGING(dbc)) {
		memset(&ldbt, 0, sizeof(ldbt));
		ldbt.data = h;
		ldbt.size = P_OVERHEAD(dbp);
		/*
		 * If we are truncating the file, we need to make sure
		 * the logging happens before the truncation.  If we
		 * are truncating multiple pages we don't need to flush the
		 * log here as it will be flushed by __db_truncate_freelist.
		 * If we are zeroing pages rather than truncating we still
		 * need to flush since they will not have valid LSNs.
		 */
		lflag = 0;

		if (h->pgno == last_pgno
#ifdef HAVE_FTRUNCATE
		    && do_truncate == 0
#endif
		    )
			lflag = DB_FLUSH;
		switch (h->type) {
		case P_HASH:
		case P_IBTREE:
		case P_IRECNO:
		case P_LBTREE:
		case P_LRECNO:
		case P_LDUP:
			if (h->entries > 0) {
				ldbt.size += h->entries * sizeof(db_indx_t);
				ddbt.data = (u_int8_t *)h + HOFFSET(h);
				ddbt.size = dbp->pgsize - HOFFSET(h);
				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
				    lsnp, lflag,
				    h->pgno, lsnp, pgno,
				    &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
					goto err1;
				goto logged;
			}
			break;
		case P_HASHMETA:
			ldbt.size = sizeof(HMETA);
			break;
		case P_BTREEMETA:
			ldbt.size = sizeof(BTMETA);
			break;
		case P_OVERFLOW:
			ldbt.size += OV_LEN(h);
			break;
		default:
			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
		}

		if ((ret = __db_pg_free_log(dbp,
		    dbc->txn, lsnp, lflag, h->pgno,
		    lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
			goto err1;
	} else
		LSN_NOT_LOGGED(*lsnp);

logged:
#ifdef HAVE_FTRUNCATE
	if (do_truncate) {
		start = (u_int32_t)(lp - list) + 1;
		meta->last_pgno--;
		ret = __db_truncate_freelist(
		    dbc, meta, h, list, start, nelem);
		h = NULL;
	} else
#endif
	if (h->pgno == last_pgno) {
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
			goto err1;
		h = NULL;
		/* Give the page back to the OS. */
		if ((ret =
		    __memp_ftruncate(mpf, dbc->thread_info, last_pgno, 0)) != 0)
			goto err1;
		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
		meta->last_pgno--;
		h = NULL;
	} else {
		/*
		 * If we are not truncating the page then we
		 * reinitialize it and put it at the head of
		 * the free list.
		 */
		if ((ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
#ifdef DIAGNOSTIC
		memset((u_int8_t *)h + P_OVERHEAD(dbp),
		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
#endif
		if (prev_pgno == PGNO_INVALID)
			meta->free = h->pgno;
		else
			NEXT_PGNO(prev) = h->pgno;
	}

	/* Discard the metadata or previous page. */
err1:	if (meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (prev != (PAGE *)meta && prev != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	/* Discard the caller's page reference. */
err:	if (h != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * XXX
	 * We have to unlock the caller's page in the caller!
	 */
	return (ret);
}

#ifdef HAVE_FTRUNCATE
/*
 * __db_freelist_pos -- find the position of a page in the freelist.
 *	The list is sorted, so we do a binary search.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
 * PUBLIC:       db_pgno_t *, u_int32_t, u_int32_t *));
 * PUBLIC: #endif
 */
void
__db_freelist_pos(pgno, list, nelem, posp)
	db_pgno_t pgno;
	db_pgno_t *list;
	u_int32_t nelem;
	u_int32_t *posp;
{
	u_int32_t base, indx, lim;

	indx = 0;
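	/*
	 * Standard binary search.  On an exact match *posp is the index of
	 * pgno; otherwise it is the index of the largest entry smaller
	 * than pgno (0 if pgno is smaller than every entry).
	 */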
	for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
		indx = base + (lim >> 1);
		if (pgno == list[indx]) {
			*posp = indx;
			return;
		}
		if (pgno > list[indx]) {
			base = indx + 1;
			--lim;
		}
	}
	if (base != 0)
		base--;
	*posp = base;
	return;
}

static int
__db_pglistcmp(a, b)
	const void *a, *b;
{
	db_pglist_t *ap, *bp;

	ap = (db_pglist_t *)a;
	bp = (db_pglist_t *)b;

	return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1 : 0);
}

/*
 * __db_freelist_sort -- sort a list of free pages.
 */
static void
__db_freelist_sort(list, nelems)
	db_pglist_t *list;
	u_int32_t nelems;
{
	qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
}

/*
 * __db_pg_truncate -- sort the freelist and find the truncation point.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
 * PUBLIC:    db_pglist_t *list, DB_COMPACT *, u_int32_t *, db_pgno_t *,
 * PUBLIC:    DB_LSN *, int));
 * PUBLIC: #endif
 */
int
__db_pg_truncate(dbc, txn, list, c_data, nelemp, last_pgno, lsnp, in_recovery)
	DBC *dbc;
	DB_TXN *txn;
	db_pglist_t *list;
	DB_COMPACT *c_data;
	u_int32_t *nelemp;
	db_pgno_t *last_pgno;
	DB_LSN *lsnp;
	int in_recovery;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pglist_t *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret;

	ret = 0;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	nelems = *nelemp;
	/* Sort the list */
	__db_freelist_sort(list, nelems);

	/* Find the truncation point. */
	pgno = *last_pgno;
	lp = &list[nelems - 1];
	while (nelems != 0) {
		if (lp->pgno != pgno)
			break;
		pgno--;
		nelems--;
		lp--;
	}
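	/*
	 * pgno is now the last page we have to keep.  The nelems entries
	 * still in the list are free pages that fall below the truncation
	 * point and stay in the file.
	 */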

	/*
	 * Figure out what (if any) pages can be truncated immediately and
	 * record the place from which we can truncate, so we can do the
	 * memp_ftruncate below.  We also use this to avoid ever putting
	 * these pages on the freelist, which we are about to relink.
	 */
	for (lp = list; lp < &list[nelems]; lp++) {
		if ((ret = __memp_fget(
		    mpf, &lp->pgno, dbc->thread_info, txn, 0, &h)) != 0) {
			/* Page may have been truncated later. */
			if (in_recovery && ret == DB_PAGE_NOTFOUND) {
				ret = 0;
				continue;
			}
			goto err;
		}
		if (!in_recovery || LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
			if ((ret = __memp_dirty(mpf, &h,
			    dbc->thread_info, txn, dbp->priority, 0)) != 0) {
				(void)__memp_fput(mpf,
				    dbc->thread_info, h, dbp->priority);
				goto err;
			}
			if (lp == &list[nelems - 1])
				NEXT_PGNO(h) = PGNO_INVALID;
			else
				NEXT_PGNO(h) = lp[1].pgno;
			DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);

			LSN(h) = *lsnp;
		}
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbp->priority)) != 0)
			goto err;
	}

	if (pgno != *last_pgno) {
		if ((ret = __memp_ftruncate(mpf, dbc->thread_info,
		    pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
			goto err;
		if (c_data)
			c_data->compact_pages_truncated += *last_pgno - pgno;
		*last_pgno = pgno;
	}
	*nelemp = nelems;

err:	return (ret);
}

/*
 * __db_free_truncate --
 *	Truncate free pages at the end of the file.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 * PUBLIC:    u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
 * PUBLIC:    db_pgno_t *));
 * PUBLIC: #endif
 */
int
__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	DB_COMPACT *c_data;
	db_pglist_t **listp;
	u_int32_t *nelemp;
	db_pgno_t *last_pgnop;
{
	DBC *dbc;
	DBMETA *meta;
	DBT ddbt;
	DB_LOCK metalock;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pglist_t *list, *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret, t_ret;
	size_t size;

	COMPQUIET(flags, 0);
	list = NULL;
	meta = NULL;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	nelems = 0;
	if (listp != NULL) {
		*listp = NULL;
		DB_ASSERT(env, nelemp != NULL);
		*nelemp = 0;
	}

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
	    &meta)) != 0)
		goto err;

	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;
	if ((pgno = meta->free) == PGNO_INVALID)
		goto done;

	size = 128;
	if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
		goto err;
	lp = list;

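	/*
	 * Walk the free-list chain from the meta page, recording each
	 * page's number and LSN; the array doubles whenever it fills.
	 */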
	do {
		if (lp == &list[size]) {
			size *= 2;
			if ((ret = __os_realloc(env,
			    size * sizeof(*list), &list)) != 0)
				goto err;
			lp = &list[size / 2];
		}
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, 0, &h)) != 0)
			goto err;

		lp->pgno = pgno;
		lp->lsn = LSN(h);
		pgno = NEXT_PGNO(h);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0)
			goto err;
		lp++;
	} while (pgno != PGNO_INVALID);
	nelems = (u_int32_t)(lp - list);

	if ((ret = __memp_dirty(mpf,
	    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;

	/* Log the current state of the free list */
	if (DBC_LOGGING(dbc)) {
		ddbt.data = list;
		ddbt.size = nelems * sizeof(*lp);
		ZERO_LSN(null_lsn);
		if ((ret = __db_pg_sort_log(dbp,
		    dbc->txn, &LSN(meta), DB_FLUSH, PGNO_BASE_MD, &LSN(meta),
		    PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
	    &nelems, &meta->last_pgno, &LSN(meta), 0)) != 0)
		goto err;

	if (nelems == 0)
		meta->free = PGNO_INVALID;
	else
		meta->free = list[0].pgno;

done:	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;

	/*
	 * The truncation point is the number of pages in the free list
	 * back from the last page.  The number of pages in the free list
	 * is the number that we can swap in.
	 */
	if (c_data)
		c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;

	if (nelems != 0 && listp != NULL) {
		*listp = list;
		*nelemp = nelems;
		list = NULL;
	}

err:	if (list != NULL)
		__os_free(env, list);
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

static int
__db_truncate_freelist(dbc, meta, h, list, start, nelem)
	DBC *dbc;
	DBMETA *meta;
	PAGE *h;
	db_pgno_t *list;
	u_int32_t start, nelem;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *last_free, *pg;
	db_pgno_t *lp;
	db_pglist_t *plist, *pp;
	int ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	plist = NULL;
	last_free = NULL;

	if (start != 0 &&
	    (ret = __memp_fget(mpf, &list[start - 1],
	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
		goto err;

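	/*
	 * When logging, record the page number and LSN of every page about
	 * to be truncated so the log record carries enough state for
	 * recovery to recreate them.
	 */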
	if (DBC_LOGGING(dbc)) {
		if ((ret = __os_malloc(dbp->env,
		    (nelem - start) * sizeof(*pp), &plist)) != 0)
			goto err;

		pp = plist;
		for (lp = &list[start]; lp < &list[nelem]; lp++) {
			pp->pgno = *lp;
			if ((ret = __memp_fget(mpf, lp,
			    dbc->thread_info, dbc->txn, 0, &pg)) != 0)
				goto err;
			pp->lsn = LSN(pg);
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pp++;
		}
		ddbt.data = plist;
		ddbt.size = (nelem - start) * sizeof(*pp);
		ZERO_LSN(null_lsn);
		if (last_free != NULL) {
			if ((ret = __db_pg_sort_log(dbp, dbc->txn, &LSN(meta),
			    DB_FLUSH, PGNO(meta), &LSN(meta), PGNO(last_free),
			    &LSN(last_free), meta->last_pgno, &ddbt)) != 0)
				goto err;
		} else if ((ret = __db_pg_sort_log(dbp, dbc->txn,
		    &LSN(meta), DB_FLUSH, PGNO(meta), &LSN(meta),
		    PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));
	if (last_free != NULL)
		LSN(last_free) = LSN(meta);

	if ((ret = __memp_fput(mpf,
	    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
		goto err;
	h = NULL;
	if ((ret =
	    __memp_ftruncate(mpf, dbc->thread_info, list[start], 0)) != 0)
		goto err;
	meta->last_pgno = list[start] - 1;

	if (start == 0)
		meta->free = PGNO_INVALID;
	else {
		NEXT_PGNO(last_free) = PGNO_INVALID;
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority)) != 0)
			goto err;
		last_free = NULL;
	}

	/* Shrink the number of elements in the list. */
	ret = __memp_extend_freelist(mpf, start, &list);

err:	if (plist != NULL)
		__os_free(dbp->env, plist);

	/* We need to put the page on error. */
	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (last_free != NULL)
		(void)__memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority);

	return (ret);
}
#endif

#ifdef DEBUG
/*
 * __db_lprint --
 *	Print out the list of locks currently held by a cursor.
 *
 * PUBLIC: int __db_lprint __P((DBC *));
 */
int
__db_lprint(dbc)
	DBC *dbc;
{
	DB *dbp;
	DB_LOCKREQ req;
	ENV *env;

	dbp = dbc->dbp;
	env = dbp->env;

	if (LOCKING_ON(env)) {
		req.op = DB_LOCK_DUMP;
		(void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
	}
	return (0);
}
#endif

/*
 * __db_lget --
 *	The standard lock get call.
 *
 * PUBLIC: int __db_lget __P((DBC *,
 * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
 */
int
__db_lget(dbc, action, pgno, mode, lkflags, lockp)
	DBC *dbc;
	int action;
	db_pgno_t pgno;
	db_lockmode_t mode;
	u_int32_t lkflags;
	DB_LOCK *lockp;
{
	DB *dbp;
	DB_LOCKREQ couple[3], *reqp;
	DB_TXN *txn;
	ENV *env;
	int has_timeout, i, ret;

	dbp = dbc->dbp;
	env = dbp->env;
	txn = dbc->txn;

	/*
	 * We do not always check if we're configured for locking before
	 * calling __db_lget to acquire the lock.
	 */
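	/*
	 * No lock is taken when: CDB locking is in effect (handled at a
	 * higher layer), locking is disabled, this is a snapshot read in
	 * an MVCC database, the cursor has DBC_DONTLOCK set, the cursor is
	 * running recovery (unless this is a rollback and we are not a
	 * replication client), or this is an off-page duplicate cursor and
	 * the caller did not pass LCK_ALWAYS.
	 */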
	if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
	    (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
	    dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
	    F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
	    (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
	    (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
		LOCK_INIT(*lockp);
		return (0);
	}

	dbc->lock.pgno = pgno;
	if (lkflags & DB_LOCK_RECORD)
		dbc->lock.type = DB_RECORD_LOCK;
	else
		dbc->lock.type = DB_PAGE_LOCK;
	lkflags &= ~DB_LOCK_RECORD;

	/*
	 * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
	 * pass that along to the lock call.
	 */
	if (DB_NONBLOCK(dbc))
		lkflags |= DB_LOCK_NOWAIT;

	if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
		mode = DB_LOCK_READ_UNCOMMITTED;

	has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
	    (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));

	/*
	 * Transactional locking.
	 * Hold on to the previous read lock only if we are in full isolation.
	 * COUPLE_ALWAYS indicates we are holding an interior node which need
	 *	not be isolated.
	 * Downgrade write locks if we are supporting dirty readers.
	 */
	if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
	    !LOCK_ISSET(*lockp))
		action = 0;
	else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc->dbp,
	    DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else
		action = 0;

	i = 0;
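	/*
	 * The couple[] vector holds up to three requests: for
	 * LCK_DOWNGRADE, a re-request of the currently held lock at
	 * DB_LOCK_WWRITE; the get of the new lock (with a timeout when one
	 * applies); and, when coupling or downgrading, a put of the old
	 * lock.
	 */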
	switch (action) {
	default:
		if (has_timeout)
			goto do_couple;
		ret = __lock_get(env,
		    dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
		break;

	case LCK_DOWNGRADE:
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].lock = *lockp;
		couple[0].mode = DB_LOCK_WWRITE;
		UMRW_SET(couple[0].timeout);
		i++;
		/* FALLTHROUGH */
	case LCK_COUPLE:
do_couple:	couple[i].op = has_timeout ? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
		couple[i].obj = &dbc->lock_dbt;
		couple[i].mode = mode;
		UMRW_SET(couple[i].timeout);
		i++;
		if (has_timeout)
			couple[0].timeout =
			    F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
		if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
			couple[i].op = DB_LOCK_PUT;
			couple[i].lock = *lockp;
			i++;
		}

		ret = __lock_vec(env,
		    dbc->locker, lkflags, couple, i, &reqp);
		if (ret == 0 || reqp == &couple[i - 1])
			*lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
		break;
	}

	if (txn != NULL && ret == DB_LOCK_DEADLOCK)
		F_SET(txn, TXN_DEADLOCK);
	return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
	    DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
}

/*
 * __db_lput --
 *	The standard lock put call.
 *
 * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
 */
int
__db_lput(dbc, lockp)
	DBC *dbc;
	DB_LOCK *lockp;
{
	DB_LOCKREQ couple[2], *reqp;
	ENV *env;
	int action, ret;

	/*
	 * Transactional locking.
	 * Hold on to the read locks only if we are in full isolation.
	 * Downgrade write locks if we are supporting dirty readers.
	 */
	if (F_ISSET(dbc->dbp,
	    DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else if (dbc->txn == NULL)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else
		action = 0;

	env = dbc->env;
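	/*
	 * Here LCK_COUPLE simply means the lock does not have to be held
	 * for the rest of the transaction, so release it; LCK_DOWNGRADE
	 * trades the write lock for a was-write (WWRITE) lock so readers
	 * running at dirty-read isolation can proceed.
	 */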
	switch (action) {
	case LCK_COUPLE:
		ret = __lock_put(env, lockp);
		break;
	case LCK_DOWNGRADE:
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].mode = DB_LOCK_WWRITE;
		couple[0].lock = *lockp;
		UMRW_SET(couple[0].timeout);
		couple[1].op = DB_LOCK_PUT;
		couple[1].lock = *lockp;
		ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
		if (ret == 0 || reqp == &couple[1])
			*lockp = couple[0].lock;
		break;
	default:
		ret = 0;
		break;
	}

	return (ret);
}