1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: bt_stat.c,v 12.22 2008/03/11 21:07:19 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/btree.h"
14#include "dbinc/lock.h"
15#include "dbinc/mp.h"
16
17#ifdef HAVE_STATISTICS
18/*
19 * __bam_stat --
20 *	Gather/print the btree statistics
21 *
22 * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t));
23 */
24int
25__bam_stat(dbc, spp, flags)
26	DBC *dbc;
27	void *spp;
28	u_int32_t flags;
29{
30	BTMETA *meta;
31	BTREE *t;
32	BTREE_CURSOR *cp;
33	DB *dbp;
34	DB_BTREE_STAT *sp;
35	DB_LOCK lock, metalock;
36	DB_MPOOLFILE *mpf;
37	ENV *env;
38	PAGE *h;
39	db_pgno_t pgno;
40	int ret, t_ret, write_meta;
41
42	dbp = dbc->dbp;
43	env = dbp->env;
44
45	meta = NULL;
46	t = dbp->bt_internal;
47	sp = NULL;
48	LOCK_INIT(metalock);
49	LOCK_INIT(lock);
50	mpf = dbp->mpf;
51	h = NULL;
52	ret = write_meta = 0;
53
54	cp = (BTREE_CURSOR *)dbc->internal;
55
56	/* Allocate and clear the structure. */
57	if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
58		goto err;
59	memset(sp, 0, sizeof(*sp));
60
61	/* Get the metadata page for the entire database. */
62	pgno = PGNO_BASE_MD;
63	if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0)
64		goto err;
65	if ((ret = __memp_fget(mpf, &pgno,
66	     dbc->thread_info, dbc->txn, 0, &meta)) != 0)
67		goto err;
68
69	if (flags == DB_FAST_STAT)
70		goto meta_only;
71
72	/* Walk the metadata free list, counting pages. */
73	for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) {
74		++sp->bt_free;
75
76		if ((ret = __memp_fget(mpf, &pgno,
77		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
78			goto err;
79
80		pgno = h->next_pgno;
81		if ((ret = __memp_fput(mpf,
82		    dbc->thread_info, h, dbc->priority)) != 0)
83			goto err;
84		h = NULL;
85	}
86
87	/* Get the root page. */
88	pgno = cp->root;
89	if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
90		goto err;
91	if ((ret = __memp_fget(mpf, &pgno,
92	     dbc->thread_info, dbc->txn, 0, &h)) != 0)
93		goto err;
94
95	/* Get the levels from the root page. */
96	sp->bt_levels = h->level;
97
98	/* Discard the root page. */
99	ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
100	h = NULL;
101	if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
102		ret = t_ret;
103	if (ret != 0)
104		goto err;
105
106	/* Walk the tree. */
107	if ((ret = __bam_traverse(dbc,
108	    DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0)
109		goto err;
110
111	/*
112	 * Get the subdatabase metadata page if it's not the same as the
113	 * one we already have.
114	 */
115	write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
116	    (!MULTIVERSION(dbp) || dbc->txn != NULL);
117meta_only:
118	if (t->bt_meta != PGNO_BASE_MD || write_meta) {
119		ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
120		meta = NULL;
121		if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
122			ret = t_ret;
123		if (ret != 0)
124			goto err;
125
126		if ((ret = __db_lget(dbc,
127		    0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ,
128		    0, &metalock)) != 0)
129			goto err;
130		if ((ret = __memp_fget(mpf, &t->bt_meta,
131		     dbc->thread_info, dbc->txn,
132		    write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0)
133			goto err;
134	}
135	if (flags == DB_FAST_STAT) {
136		if (dbp->type == DB_RECNO ||
137		    (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) {
138			if ((ret = __db_lget(dbc, 0,
139			    cp->root, DB_LOCK_READ, 0, &lock)) != 0)
140				goto err;
141			if ((ret = __memp_fget(mpf, &cp->root,
142			     dbc->thread_info, dbc->txn, 0, &h)) != 0)
143				goto err;
144
145			sp->bt_nkeys = RE_NREC(h);
146		} else
147			sp->bt_nkeys = meta->dbmeta.key_count;
148
149		sp->bt_ndata = dbp->type == DB_RECNO ?
150		   sp->bt_nkeys : meta->dbmeta.record_count;
151	}
152
153	/* Get metadata page statistics. */
154	sp->bt_metaflags = meta->dbmeta.flags;
155	sp->bt_minkey = meta->minkey;
156	sp->bt_re_len = meta->re_len;
157	sp->bt_re_pad = meta->re_pad;
158	/*
159	 * Don't take the page number from the meta-data page -- that value is
160	 * only maintained in the primary database, we may have been called on
161	 * a subdatabase.  (Yes, I read the primary database meta-data page
162	 * earlier in this function, but I'm asking the underlying cache so the
163	 * code for the Hash and Btree methods is the same.)
164	 */
165	if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
166		goto err;
167	sp->bt_pagecnt = pgno + 1;
168	sp->bt_pagesize = meta->dbmeta.pagesize;
169	sp->bt_magic = meta->dbmeta.magic;
170	sp->bt_version = meta->dbmeta.version;
171
172	if (write_meta != 0) {
173		meta->dbmeta.key_count = sp->bt_nkeys;
174		meta->dbmeta.record_count = sp->bt_ndata;
175	}
176
177	*(DB_BTREE_STAT **)spp = sp;
178
179err:	/* Discard the second page. */
180	if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
181		ret = t_ret;
182	if (h != NULL && (t_ret = __memp_fput(mpf,
183	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
184		ret = t_ret;
185
186	/* Discard the metadata page. */
187	if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
188		ret = t_ret;
189	if (meta != NULL && (t_ret = __memp_fput(mpf,
190	    dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
191		ret = t_ret;
192
193	if (ret != 0 && sp != NULL) {
194		__os_ufree(env, sp);
195		*(DB_BTREE_STAT **)spp = NULL;
196	}
197
198	return (ret);
199}
200
201/*
202 * __bam_stat_print --
203 *	Display btree/recno statistics.
204 *
205 * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t));
206 */
207int
208__bam_stat_print(dbc, flags)
209	DBC *dbc;
210	u_int32_t flags;
211{
212	static const FN fn[] = {
213		{ BTM_DUP,	"duplicates" },
214		{ BTM_RECNO,	"recno" },
215		{ BTM_RECNUM,	"record-numbers" },
216		{ BTM_FIXEDLEN,	"fixed-length" },
217		{ BTM_RENUMBER,	"renumber" },
218		{ BTM_SUBDB,	"multiple-databases" },
219		{ BTM_DUPSORT,	"sorted duplicates" },
220		{ 0,		NULL }
221	};
222	DB *dbp;
223	DB_BTREE_STAT *sp;
224	ENV *env;
225	int lorder, ret;
226	const char *s;
227
228	dbp = dbc->dbp;
229	env = dbp->env;
230
231	if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
232		return (ret);
233
234	if (LF_ISSET(DB_STAT_ALL)) {
235		__db_msg(env, "%s", DB_GLOBAL(db_line));
236		__db_msg(env, "Default Btree/Recno database information:");
237	}
238
239	__db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic);
240	__db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version);
241
242	(void)__db_get_lorder(dbp, &lorder);
243	switch (lorder) {
244	case 1234:
245		s = "Little-endian";
246		break;
247	case 4321:
248		s = "Big-endian";
249		break;
250	default:
251		s = "Unrecognized byte order";
252		break;
253	}
254	__db_msg(env, "%s\tByte order", s);
255	__db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags");
256	if (dbp->type == DB_BTREE)
257		__db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey);
258	if (dbp->type == DB_RECNO) {
259		__db_dl(env,
260		    "Fixed-length record size", (u_long)sp->bt_re_len);
261		__db_msg(env,
262		    "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
263	}
264	__db_dl(env,
265	    "Underlying database page size", (u_long)sp->bt_pagesize);
266	if (dbp->type == DB_BTREE)
267		__db_dl(env, "Overflow key/data size",
268		    ((BTREE_CURSOR *)dbc->internal)->ovflsize);
269	__db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels);
270	__db_dl(env, dbp->type == DB_BTREE ?
271	    "Number of unique keys in the tree" :
272	    "Number of records in the tree", (u_long)sp->bt_nkeys);
273	__db_dl(env,
274	    "Number of data items in the tree", (u_long)sp->bt_ndata);
275
276	__db_dl(env,
277	    "Number of tree internal pages", (u_long)sp->bt_int_pg);
278	__db_dl_pct(env,
279	    "Number of bytes free in tree internal pages",
280	    (u_long)sp->bt_int_pgfree,
281	    DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff");
282
283	__db_dl(env,
284	    "Number of tree leaf pages", (u_long)sp->bt_leaf_pg);
285	__db_dl_pct(env, "Number of bytes free in tree leaf pages",
286	    (u_long)sp->bt_leaf_pgfree, DB_PCT_PG(
287	    sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff");
288
289	__db_dl(env,
290	    "Number of tree duplicate pages", (u_long)sp->bt_dup_pg);
291	__db_dl_pct(env,
292	    "Number of bytes free in tree duplicate pages",
293	    (u_long)sp->bt_dup_pgfree,
294	    DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff");
295
296	__db_dl(env,
297	    "Number of tree overflow pages", (u_long)sp->bt_over_pg);
298	__db_dl_pct(env, "Number of bytes free in tree overflow pages",
299	    (u_long)sp->bt_over_pgfree, DB_PCT_PG(
300	    sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff");
301	__db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg);
302
303	__db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free);
304
305	__os_ufree(env, sp);
306
307	return (0);
308}
309
310/*
311 * __bam_stat_callback --
312 *	Statistics callback.
313 *
314 * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
315 */
316int
317__bam_stat_callback(dbc, h, cookie, putp)
318	DBC *dbc;
319	PAGE *h;
320	void *cookie;
321	int *putp;
322{
323	DB *dbp;
324	DB_BTREE_STAT *sp;
325	db_indx_t indx, *inp, top;
326	u_int8_t type;
327
328	dbp = dbc->dbp;
329	sp = cookie;
330	*putp = 0;
331	top = NUM_ENT(h);
332	inp = P_INP(dbp, h);
333
334	switch (TYPE(h)) {
335	case P_IBTREE:
336	case P_IRECNO:
337		++sp->bt_int_pg;
338		sp->bt_int_pgfree += P_FREESPACE(dbp, h);
339		break;
340	case P_LBTREE:
341		if (top == 0)
342			++sp->bt_empty_pg;
343
344		/* Correct for on-page duplicates and deleted items. */
345		for (indx = 0; indx < top; indx += P_INDX) {
346			type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type;
347			/* Ignore deleted items. */
348			if (B_DISSET(type))
349				continue;
350
351			/* Ignore duplicate keys. */
352			if (indx + P_INDX >= top ||
353			    inp[indx] != inp[indx + P_INDX])
354				++sp->bt_nkeys;
355
356			/* Ignore off-page duplicates. */
357			if (B_TYPE(type) != B_DUPLICATE)
358				++sp->bt_ndata;
359		}
360
361		++sp->bt_leaf_pg;
362		sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
363		break;
364	case P_LRECNO:
365		if (top == 0)
366			++sp->bt_empty_pg;
367
368		/*
369		 * If walking a recno tree, then each of these items is a key.
370		 * Otherwise, we're walking an off-page duplicate set.
371		 */
372		if (dbp->type == DB_RECNO) {
373			/*
374			 * Correct for deleted items in non-renumbering Recno
375			 * databases.
376			 */
377			if (F_ISSET(dbp, DB_AM_RENUMBER)) {
378				sp->bt_nkeys += top;
379				sp->bt_ndata += top;
380			} else
381				for (indx = 0; indx < top; indx += O_INDX) {
382					type = GET_BKEYDATA(dbp, h, indx)->type;
383					if (!B_DISSET(type)) {
384						++sp->bt_ndata;
385						++sp->bt_nkeys;
386					}
387				}
388
389			++sp->bt_leaf_pg;
390			sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
391		} else {
392			sp->bt_ndata += top;
393
394			++sp->bt_dup_pg;
395			sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
396		}
397		break;
398	case P_LDUP:
399		if (top == 0)
400			++sp->bt_empty_pg;
401
402		/* Correct for deleted items. */
403		for (indx = 0; indx < top; indx += O_INDX)
404			if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
405				++sp->bt_ndata;
406
407		++sp->bt_dup_pg;
408		sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
409		break;
410	case P_OVERFLOW:
411		++sp->bt_over_pg;
412		sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h);
413		break;
414	default:
415		return (__db_pgfmt(dbp->env, h->pgno));
416	}
417	return (0);
418}
419
420/*
421 * __bam_print_cursor --
422 *	Display the current internal cursor.
423 *
424 * PUBLIC: void __bam_print_cursor __P((DBC *));
425 */
426void
427__bam_print_cursor(dbc)
428	DBC *dbc;
429{
430	static const FN fn[] = {
431		{ C_DELETED,	"C_DELETED" },
432		{ C_RECNUM,	"C_RECNUM" },
433		{ C_RENUMBER,	"C_RENUMBER" },
434		{ 0,		NULL }
435	};
436	ENV *env;
437	BTREE_CURSOR *cp;
438
439	env = dbc->env;
440	cp = (BTREE_CURSOR *)dbc->internal;
441
442	STAT_ULONG("Overflow size", cp->ovflsize);
443	if (dbc->dbtype == DB_RECNO)
444		STAT_ULONG("Recno", cp->recno);
445	STAT_ULONG("Order", cp->order);
446	__db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
447}
448
449#else /* !HAVE_STATISTICS */
450
451int
452__bam_stat(dbc, spp, flags)
453	DBC *dbc;
454	void *spp;
455	u_int32_t flags;
456{
457	COMPQUIET(spp, NULL);
458	COMPQUIET(flags, 0);
459
460	return (__db_stat_not_built(dbc->env));
461}
462
463int
464__bam_stat_print(dbc, flags)
465	DBC *dbc;
466	u_int32_t flags;
467{
468	COMPQUIET(flags, 0);
469
470	return (__db_stat_not_built(dbc->env));
471}
472#endif
473
474#ifndef HAVE_BREW
475/*
476 * __bam_key_range --
477 *	Return proportion of keys relative to given key.  The numbers are
478 *	slightly skewed due to on page duplicates.
479 *
480 * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
481 */
482int
483__bam_key_range(dbc, dbt, kp, flags)
484	DBC *dbc;
485	DBT *dbt;
486	DB_KEY_RANGE *kp;
487	u_int32_t flags;
488{
489	BTREE_CURSOR *cp;
490	EPG *sp;
491	double factor;
492	int exact, ret;
493
494	COMPQUIET(flags, 0);
495
496	if ((ret = __bam_search(dbc, PGNO_INVALID,
497	    dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0)
498		return (ret);
499
500	cp = (BTREE_CURSOR *)dbc->internal;
501	kp->less = kp->greater = 0.0;
502
503	factor = 1.0;
504
505	/* Correct the leaf page. */
506	cp->csp->entries /= 2;
507	cp->csp->indx /= 2;
508	for (sp = cp->sp; sp <= cp->csp; ++sp) {
509		/*
510		 * At each level we know that pages greater than indx contain
511		 * keys greater than what we are looking for and those less
512		 * than indx are less than.  The one pointed to by indx may
513		 * have some less, some greater or even equal.  If indx is
514		 * equal to the number of entries, then the key is out of range
515		 * and everything is less.
516		 */
517		if (sp->indx == 0)
518			kp->greater += factor * (sp->entries - 1)/sp->entries;
519		else if (sp->indx == sp->entries)
520			kp->less += factor;
521		else {
522			kp->less += factor * sp->indx / sp->entries;
523			kp->greater += factor *
524			    ((sp->entries - sp->indx) - 1) / sp->entries;
525		}
526		factor *= 1.0/sp->entries;
527	}
528
529	/*
530	 * If there was an exact match then assign 1 n'th to the key itself.
531	 * Otherwise that factor belongs to those greater than the key, unless
532	 * the key was out of range.
533	 */
534	if (exact)
535		kp->equal = factor;
536	else {
537		if (kp->less != 1)
538			kp->greater += factor;
539		kp->equal = 0;
540	}
541
542	BT_STK_CLR(cp);
543
544	return (0);
545}
546#endif
547
548/*
549 * __bam_traverse --
550 *	Walk a Btree database.
551 *
552 * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t,
553 * PUBLIC:     db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
554 */
555int
556__bam_traverse(dbc, mode, root_pgno, callback, cookie)
557	DBC *dbc;
558	db_lockmode_t mode;
559	db_pgno_t root_pgno;
560	int (*callback)__P((DBC *, PAGE *, void *, int *));
561	void *cookie;
562{
563	BINTERNAL *bi;
564	BKEYDATA *bk;
565	DB *dbp;
566	DB_LOCK lock;
567	DB_MPOOLFILE *mpf;
568	PAGE *h;
569	RINTERNAL *ri;
570	db_indx_t indx, *inp;
571	int already_put, ret, t_ret;
572
573	dbp = dbc->dbp;
574	mpf = dbp->mpf;
575	already_put = 0;
576
577	if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0)
578		return (ret);
579	if ((ret = __memp_fget(mpf, &root_pgno,
580	     dbc->thread_info, dbc->txn, 0, &h)) != 0) {
581		(void)__TLPUT(dbc, lock);
582		return (ret);
583	}
584
585	switch (TYPE(h)) {
586	case P_IBTREE:
587		for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
588			bi = GET_BINTERNAL(dbp, h, indx);
589			if (B_TYPE(bi->type) == B_OVERFLOW &&
590			    (ret = __db_traverse_big(dbc,
591			    ((BOVERFLOW *)bi->data)->pgno,
592			    callback, cookie)) != 0)
593				goto err;
594			if ((ret = __bam_traverse(
595			    dbc, mode, bi->pgno, callback, cookie)) != 0)
596				goto err;
597		}
598		break;
599	case P_IRECNO:
600		for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
601			ri = GET_RINTERNAL(dbp, h, indx);
602			if ((ret = __bam_traverse(
603			    dbc, mode, ri->pgno, callback, cookie)) != 0)
604				goto err;
605		}
606		break;
607	case P_LBTREE:
608		inp = P_INP(dbp, h);
609		for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) {
610			bk = GET_BKEYDATA(dbp, h, indx);
611			if (B_TYPE(bk->type) == B_OVERFLOW &&
612			    (indx + P_INDX >= NUM_ENT(h) ||
613			    inp[indx] != inp[indx + P_INDX])) {
614				if ((ret = __db_traverse_big(dbc,
615				    GET_BOVERFLOW(dbp, h, indx)->pgno,
616				    callback, cookie)) != 0)
617					goto err;
618			}
619			bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
620			if (B_TYPE(bk->type) == B_DUPLICATE &&
621			    (ret = __bam_traverse(dbc, mode,
622			    GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
623			    callback, cookie)) != 0)
624				goto err;
625			if (B_TYPE(bk->type) == B_OVERFLOW &&
626			    (ret = __db_traverse_big(dbc,
627			    GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
628			    callback, cookie)) != 0)
629				goto err;
630		}
631		break;
632	case P_LDUP:
633	case P_LRECNO:
634		for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
635			bk = GET_BKEYDATA(dbp, h, indx);
636			if (B_TYPE(bk->type) == B_OVERFLOW &&
637			    (ret = __db_traverse_big(dbc,
638			    GET_BOVERFLOW(dbp, h, indx)->pgno,
639			    callback, cookie)) != 0)
640				goto err;
641		}
642		break;
643	default:
644		return (__db_pgfmt(dbp->env, h->pgno));
645	}
646
647	ret = callback(dbc, h, cookie, &already_put);
648
649err:	if (!already_put && (t_ret = __memp_fput(mpf,
650	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
651		ret = t_ret;
652	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
653		ret = t_ret;
654
655	return (ret);
656}
657