/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_fget.c,v 12.53 2008/04/28 02:59:57 alexg Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * __memp_fget_pp --
 *	DB_MPOOLFILE->get pre/post processing.
 *
 * PUBLIC: int __memp_fget_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_TXN *txnp;
	u_int32_t flags;
	void *addrp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int rep_blocked, ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
	    DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_DIRTY:
		case DB_MPOOL_CREATE:
		case DB_MPOOL_EDIT:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
			break;
		default:
			return (__db_ferr(env, "memp_fget", 1));
		}
	}

	ENV_ENTER(env, ip);

	rep_blocked = 0;
	if (txnp == NULL && IS_ENV_REPLICATED(env)) {
		if ((ret = __op_rep_enter(env)) != 0)
			goto err;
		rep_blocked = 1;
	}
	ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
	/*
	 * We only decrement the count in op_rep_exit if the operation fails.
	 * Otherwise the count will be decremented when the page is no longer
	 * pinned in memp_fput.
	 */
	if (ret != 0 && rep_blocked)
		(void)__op_rep_exit(env);

	/* Similarly if an app has a page pinned it is ACTIVE. */
err:	if (ret != 0)
		ENV_LEAVE(env, ip);

	return (ret);
}
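
/*
 * For reference, a minimal sketch of how application code typically drives
 * this path through the public handle.  Error handling is omitted, and the
 * put() priority argument is assumed to match this release's interface:
 *
 *	DB_MPOOLFILE *mpf;
 *	db_pgno_t pgno;
 *	void *page;
 *	int ret;
 *
 *	pgno = 1;
 *	if ((ret = mpf->get(mpf, &pgno, NULL, 0, &page)) == 0) {
 *		... examine or update the page contents ...
 *		ret = mpf->put(mpf, page, DB_PRIORITY_UNCHANGED, 0);
 *	}
 */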

/*
 * __memp_fget --
 *	Get a page from the file.
 *
 * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
 * PUBLIC:     db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp, *frozen_bhp, *oldest_bhp;
	ENV *env;
	DB_LSN *read_lsnp, vlsn;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *t_infop, *reginfo;
	TXN_DETAIL *td;
	roff_t list_off, mf_offset;
	u_int32_t pinmax, st_hsearch;
	int b_incr, b_locked, dirty, edit, extending, first;
	int makecopy, mvcc, need_free, ret;

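	/*
	 * Clear the caller's page pointer up front so an early error can
	 * never hand back a stale address.  The COMPQUIET calls only quiet
	 * "may be used uninitialized" compiler warnings.
	 */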
	*(void **)addrp = NULL;
	COMPQUIET(c_mp, NULL);
	COMPQUIET(infop, NULL);
	COMPQUIET(oldest_bhp, NULL);

	env = dbmfp->env;
	dbmp = env->mp_handle;

	mfp = dbmfp->mfp;
	mvcc = mfp->multiversion;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = frozen_bhp = NULL;
	read_lsnp = NULL;
	td = NULL;
	hp = NULL;
	b_incr = b_locked = extending = makecopy = ret = 0;

	if (LF_ISSET(DB_MPOOL_DIRTY)) {
		if (F_ISSET(dbmfp, MP_READONLY)) {
			__db_errx(env,
			    "%s: dirty flag set for readonly file page",
			    __memp_fn(dbmfp));
			return (EINVAL);
		}
		if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
		    flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
			return (ret);
	}

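	/*
	 * Remember whether the caller asked for a writable buffer, then
	 * strip those bits from flags: the rest of this function only
	 * dispatches on the page-location flags (CREATE, LAST, NEW).
	 */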
	dirty = LF_ISSET(DB_MPOOL_DIRTY);
	edit = LF_ISSET(DB_MPOOL_EDIT);
	LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);

	/*
	 * If the transaction is being used to update a multiversion database
	 * for the first time, set the read LSN.  In addition, if this is an
	 * update, allocate a mutex.  If no transaction has been supplied, that
	 * will be caught later, when we know whether one is required.
	 */
	if (mvcc && txn != NULL && txn->td != NULL) {
		/* We're only interested in the ultimate parent transaction. */
		while (txn->parent != NULL)
			txn = txn->parent;
		td = (TXN_DETAIL *)txn->td;
		if (F_ISSET(txn, TXN_SNAPSHOT)) {
			read_lsnp = &td->read_lsn;
			if (IS_MAX_LSN(*read_lsnp) &&
			    (ret = __log_current_lsn(env, read_lsnp,
			    NULL, NULL)) != 0)
				return (ret);
		}
		if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
		    td->mvcc_mtx == MUTEX_INVALID && (ret =
		    __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
			return (ret);
	}

	switch (flags) {
	case DB_MPOOL_LAST:
		/* Get the last page number in the file. */
		MUTEX_LOCK(env, mfp->mutex);
		*pgnoaddr = mfp->last_pgno;
		MUTEX_UNLOCK(env, mfp->mutex);
		break;
	case DB_MPOOL_NEW:
		/*
		 * If always creating a page, skip the first search
		 * of the hash bucket.
		 */
		state = FIRST_MISS;
		goto alloc;
	case DB_MPOOL_CREATE:
	default:
		break;
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.  We can't use R_ADDR here: this is an offset
	 * into an mmap'd file, not a shared region, and doesn't change for
	 * private environments.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp = (u_int8_t *)dbmfp->addr +
		    (*pgnoaddr * mfp->stat.st_pagesize);
		STAT(++mfp->stat.st_map);
		return (0);
	}

retry:	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  These are reset on each pass through this
	 * code because the page number can change.
	 */
	MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

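	/*
	 * MP_GET_BUCKET returns with the hash bucket mutex locked, which is
	 * why b_locked is set below.
	 */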
	/* Search the hash chain for the page. */
	st_hsearch = 0;
	b_locked = 1;
	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Snapshot reads -- get the version visible at read_lsn. */
		if (mvcc && !edit && read_lsnp != NULL) {
			while (bhp != NULL &&
			    !BH_OWNED_BY(env, bhp, txn) &&
			    !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
				bhp = SH_CHAIN_PREV(bhp, vc, __bh);

			DB_ASSERT(env, bhp != NULL);
		}

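		/*
		 * An MVCC update of a version owned by another transaction
		 * must go through copy-on-write: note that here.
		 */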
		makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);

		if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
			DB_ASSERT(env, frozen_bhp == NULL);
			frozen_bhp = bhp;
		}

		/*
		 * Increment the reference count.  We may discard the hash
		 * bucket lock as we evaluate and/or read the buffer, so we
		 * need to ensure it doesn't move and its contents remain
		 * unchanged.
		 */
		if (bhp->ref == UINT16_MAX) {
			__db_errx(env,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = __env_panic(env, EINVAL);
			goto err;
		}
		++bhp->ref;
		b_incr = 1;

		/*
		 * BH_LOCKED --
		 * I/O is in progress or sync is waiting on the buffer to write
		 * it.  Because we've incremented the buffer reference count,
		 * we know the buffer can't move.  Unlock the bucket lock, wait
		 * for the buffer to become available, re-acquire the bucket.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
		    !F_ISSET(env->dbenv, DB_ENV_NOLOCKING); first = 0) {
			/*
			 * If someone is trying to sync this buffer and the
			 * buffer is hot, they may never get in.  Give up and
			 * try again.
			 */
			if (!first && bhp->ref_sync != 0) {
				--bhp->ref;
				MUTEX_UNLOCK(env, hp->mtx_hash);
				bhp = frozen_bhp = NULL;
				b_incr = b_locked = 0;
				__os_yield(env, 0, 1);
				goto retry;
			}

			/*
			 * If we're the first thread waiting on I/O, set the
			 * flag so the thread doing I/O knows to wake us up,
			 * and lock the mutex.
			 */
			if (!F_ISSET(hp, IO_WAITER)) {
				F_SET(hp, IO_WAITER);
				MUTEX_LOCK(env, hp->mtx_io);
			}
			STAT(++hp->hash_io_wait);

			/* Release the hash bucket lock. */
			MUTEX_UNLOCK(env, hp->mtx_hash);

			/* Wait for I/O to finish. */
			MUTEX_LOCK(env, hp->mtx_io);
			MUTEX_UNLOCK(env, hp->mtx_io);

			/* Re-acquire the hash bucket lock. */
			MUTEX_LOCK(env, hp->mtx_hash);
		}

		/*
		 * If the buffer was frozen before we waited for any I/O to
		 * complete and is still frozen, we will need to thaw it.
		 * Otherwise, it was thawed while we waited, and we need to
		 * search again.
		 */
		if (frozen_bhp != NULL && F_ISSET(frozen_bhp, BH_THAWED)) {
thawed:			need_free = (--frozen_bhp->ref == 0);
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			if (alloc_bhp != NULL) {
				__memp_free(infop, mfp, alloc_bhp);
				alloc_bhp = NULL;
			}
			if (need_free)
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    frozen_bhp, hq);
			MPOOL_REGION_UNLOCK(env, infop);
			bhp = frozen_bhp = NULL;
			goto retry;
		}

		/*
		 * If the buffer we wanted was frozen or thawed while we
		 * waited, we need to start again.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) {
			--bhp->ref;
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			bhp = frozen_bhp = NULL;
			goto retry;
		} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
			ret = DB_LOCK_DEADLOCK;
			goto err;
		}

#ifdef HAVE_STATISTICS
		++mfp->stat.st_cache_hit;
#endif
		break;
	}

#ifdef HAVE_STATISTICS
	/*
	 * Update the hash bucket search statistics -- do it now because our
	 * next search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;
#endif

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);

	switch (state) {
	case FIRST_FOUND:
		/*
		 * If we are to free the buffer, then this had better be the
		 * only reference. If so, just free the buffer.  If not,
		 * complain and get out.
		 */
		if (flags == DB_MPOOL_FREE) {
			if (--bhp->ref == 0) {
				if (F_ISSET(bhp, BH_DIRTY)) {
					--hp->hash_page_dirty;
					F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
				}
				/*
				 * In a multiversion database, this page could
				 * be requested again so we have to leave it in
				 * cache for now.  It should *not* ever be
				 * requested again for modification without an
				 * intervening DB_MPOOL_CREATE or DB_MPOOL_NEW.
				 *
				 * Mark it with BH_FREED so we don't reuse the
				 * data when the page is resurrected.
				 */
				if (mvcc && (F_ISSET(bhp, BH_FROZEN) ||
				    !SH_CHAIN_SINGLETON(bhp, vc) ||
				    bhp->td_off == INVALID_ROFF ||
				    !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)))) {
					F_SET(bhp, BH_FREED);
					MUTEX_UNLOCK(env, hp->mtx_hash);
					return (0);
				}
				return (__memp_bhfree(
				    dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
			}
			__db_errx(env,
			    "File %s: freeing pinned buffer for page %lu",
				__memp_fns(dbmp, mfp), (u_long)*pgnoaddr);
			ret = __env_panic(env, EINVAL);
			goto err;
		}

		if (mvcc) {
			if (flags == DB_MPOOL_CREATE &&
			    F_ISSET(bhp, BH_FREED)) {
				extending = makecopy = 1;
				MUTEX_LOCK(env, mfp->mutex);
				if (*pgnoaddr > mfp->last_pgno)
					mfp->last_pgno = *pgnoaddr;
				MUTEX_UNLOCK(env, mfp->mutex);
			}

			/*
			 * With multiversion databases, we might need to
			 * allocate a new buffer into which we can copy the one
			 * that we found.  In that case, check the last buffer
			 * in the chain to see whether we can reuse an obsolete
			 * buffer.
			 *
			 * To provide snapshot isolation, we need to make sure
			 * that we've seen a buffer older than the oldest
			 * snapshot read LSN.
			 */
reuse:			if ((makecopy || frozen_bhp != NULL) && (oldest_bhp =
			    SH_CHAIN_PREV(bhp, vc, __bh)) != NULL) {
				while (SH_CHAIN_HASPREV(oldest_bhp, vc))
					oldest_bhp = SH_CHAIN_PREVP(oldest_bhp,
					    vc, __bh);

				if (oldest_bhp->ref == 0 && !BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    (ret = __txn_oldest_reader(env,
				    &hp->old_reader)) != 0)
					goto err;

				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    oldest_bhp->ref == 0) {
					if (F_ISSET(oldest_bhp, BH_FROZEN)) {
						++oldest_bhp->ref;
						if ((ret = __memp_bh_thaw(dbmp,
						    infop, hp, oldest_bhp,
						    NULL)) != 0)
							goto err;
						goto reuse;
					} else if ((ret = __memp_bhfree(dbmp,
					    infop, hp, oldest_bhp,
					    BH_FREE_REUSE)) != 0)
						goto err;
					alloc_bhp = oldest_bhp;
				}

				DB_ASSERT(env, alloc_bhp == NULL ||
				    !F_ISSET(alloc_bhp, BH_FROZEN));
			}
		}

		/* We found the buffer or we're ready to copy -- we're done. */
		if ((!makecopy && frozen_bhp == NULL) || alloc_bhp != NULL)
			break;

		/* FALLTHROUGH */
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		b_locked = 0;

		/*
		 * The buffer is not in the pool, so we don't need to free it.
		 */
		if (flags == DB_MPOOL_FREE)
			return (0);

alloc:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
		 * it's an error to try to get a page past the end of the file.
		 */
		DB_ASSERT(env, !b_locked);
		MUTEX_LOCK(env, mfp->mutex);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			if (mfp->maxpgno != 0 &&
			    mfp->last_pgno >= mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else
				*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else if (!extending)
				extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
			break;
		}
		MUTEX_UNLOCK(env, mfp->mutex);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, infop and c_mp have
		 * not yet been initialized.
		 */
		MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
		if (ret != 0)
			goto err;
		c_mp = infop->primary;

		/* Allocate a new buffer header and data space. */
		if ((ret =
		    __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;
#ifdef DIAGNOSTIC
		if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_errx(env,
		    "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
			ret = __env_panic(env, EINVAL);
			goto err;
		}
#endif
		/*
		 * If we are extending the file, we'll need the mfp lock
		 * again.
		 */
		if (extending)
			MUTEX_LOCK(env, mfp->mutex);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is interesting
		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
		 * did not specify the page number, and so, may reasonably not
		 * have any way to lock the page outside of mpool.) Regardless,
		 * if we allocate the page, and some other thread of control
		 * requests the page by number, we will not detect that and the
		 * thread of control that allocated using DB_MPOOL_NEW may not
		 * have a chance to initialize the page.  (Note: we *could*
		 * detect this case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until the
		 * reference count went to 0, that is, until the creating page
		 * put the page.)  What we do guarantee is that if two threads
		 * of control are both doing DB_MPOOL_NEW calls, they won't
		 * collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the page
		 * we were planning to allocate while we were off doing buffer
		 * allocation.  We can detect that by making sure the page
		 * number we were going to use is still available.  If it's
		 * not, then we check to see if the next available page number
		 * hashes to the same mpool region as the old one -- if it
		 * does, we can continue, otherwise, we have to start over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
			if (ret != 0)
				goto err;
			if (t_infop != infop) {
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the mfp locked.
				 */
				MUTEX_UNLOCK(env, mfp->mutex);

				MPOOL_REGION_LOCK(env, infop);
				__memp_free(infop, mfp, alloc_bhp);
				c_mp->stat.st_pages--;
				MPOOL_REGION_UNLOCK(env, infop);

				alloc_bhp = NULL;
				goto alloc;
			}
		}

		/*
		 * We released the mfp lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
			if (*pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;

			MUTEX_UNLOCK(env, mfp->mutex);
			if (ret != 0)
				goto err;
		}

		/*
		 * If we're doing copy-on-write, we will already have the
		 * buffer header.  In that case, we don't need to search again.
		 */
		if (bhp != NULL) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
			break;
		}
		DB_ASSERT(env, frozen_bhp == NULL);
		goto retry;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.  If we're about to copy-on-write,
		 * this is exactly the situation we want.
		 *
		 * For multiversion files, we may have left some pages in cache
		 * beyond the end of a file after truncating.  In that case, we
		 * would get to here with extending set.  If so, we need to
		 * insert the new page in the version chain similar to when
		 * we copy on write.
		 */
		if (extending && F_ISSET(bhp, BH_FREED))
			makecopy = 1;
		if (makecopy || frozen_bhp != NULL)
			break;

		/*
		 * Free the allocated memory; we no longer need it.  Since we
		 * can't acquire the region lock while holding the hash bucket
		 * lock, we have to release the hash bucket and re-acquire it.
		 * That's OK, because we have the buffer pinned down.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
		alloc_bhp = NULL;

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
		 * release our pin on this particular buffer, and try to get
		 * another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			--bhp->ref;
			b_incr = b_locked = 0;
			bhp = NULL;
			goto alloc;
		}

		/* We can use the page -- get the bucket lock. */
		MUTEX_LOCK(env, hp->mtx_hash);
		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and found
		 * the page still missing on our second pass through the buffer
		 * cache.  Instantiate the page.
		 */
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can call
		 * __memp_bhfree if an error occurs.
		 *
		 * Append the buffer to the tail of the bucket list and update
		 * the hash bucket's priority.
		 */
		/*lint --e{668} (flexelint: bhp cannot be NULL). */
#ifdef DIAG_MVCC
		memset(bhp, 0, SSZ(BH, align_off));
#else
		memset(bhp, 0, sizeof(BH));
#endif
		bhp->ref = 1;
		b_incr = 1;
		bhp->priority = UINT32_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		SH_CHAIN_INIT(bhp, vc);

		/* We created a new page, it starts dirty. */
		if (extending) {
			++hp->hash_page_dirty;
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on the
		 * page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating it
		 * if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE);
			if (mfp->clear_len == DB_CLEARLEN_NOTSET)
				memset(bhp->buf, 0, mfp->stat.st_pagesize);
			else {
				memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
			}

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
				F_SET(bhp, BH_CALLPGIN);

			STAT(++mfp->stat.st_page_create);
		} else {
			F_SET(bhp, BH_TRASH);
			STAT(++mfp->stat.st_cache_miss);
		}

		/* Increment buffer count referenced by MPOOLFILE. */
		MUTEX_LOCK(env, mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(env, mfp->mutex);
	}

	DB_ASSERT(env, bhp != NULL);
	DB_ASSERT(env, bhp->ref != 0);

	/* We've got a buffer header we're re-instantiating. */
	if (frozen_bhp != NULL) {
		DB_ASSERT(env, alloc_bhp != NULL);

		/*
		 * If the empty buffer has been filled in the meantime, don't
		 * overwrite it.
		 */
		if (F_ISSET(frozen_bhp, BH_THAWED))
			goto thawed;
		else {
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    frozen_bhp, alloc_bhp)) != 0)
				goto err;
			bhp = alloc_bhp;
		}

		frozen_bhp = alloc_bhp = NULL;

		/*
		 * If we're updating a buffer that was frozen, we have to go
		 * through all of that again to allocate another buffer to hold
		 * the new copy.
		 */
		if (makecopy) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			b_locked = 0;
			goto alloc;
		}
	}

	/*
	 * BH_TRASH --
	 * The buffer we found may need to be filled from the disk.
	 *
	 * It's possible for the read function to fail, which means we fail as
	 * well.  Note, the __memp_pgread() function discards and reacquires
	 * the hash lock, so the buffer must be pinned down so that it cannot
	 * move and its contents are unchanged.  Discard the buffer on failure
	 * unless another thread is waiting on our I/O to complete.  It's OK to
	 * leave the buffer around, as the waiting thread will see the BH_TRASH
	 * flag set, and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    (ret = __memp_pgread(dbmfp,
	    hp, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
		goto err;

	/*
	 * BH_CALLPGIN --
	 * The buffer was processed for being written to disk, and now has
	 * to be re-converted for use.
	 */
	if (F_ISSET(bhp, BH_CALLPGIN)) {
		MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
		    PROT_READ | PROT_WRITE);
		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
			goto err;
		F_CLR(bhp, BH_CALLPGIN);
	}

	/* Copy-on-write. */
	if (makecopy && state != SECOND_MISS) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		DB_ASSERT(env, bhp != NULL);
		DB_ASSERT(env, alloc_bhp != NULL);
		DB_ASSERT(env, alloc_bhp != bhp);

		if (bhp->ref == 1)
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ);

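		/*
		 * Initialize the new version: it inherits the old buffer's
		 * dirty state, priority and page number, is associated with
		 * our transaction if we have one, and starts out with the
		 * single reference we hold.
		 */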
		alloc_bhp->ref = 1;
		alloc_bhp->ref_sync = 0;
		alloc_bhp->flags = F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		alloc_bhp->priority = bhp->priority;
		alloc_bhp->pgno = bhp->pgno;
		alloc_bhp->mf_offset = bhp->mf_offset;
		alloc_bhp->td_off = INVALID_ROFF;
		if (txn != NULL &&
		    (ret = __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
			goto err;
		if (extending) {
			memset(alloc_bhp->buf, 0, mfp->stat.st_pagesize);
			F_SET(alloc_bhp, BH_DIRTY_CREATE);
		} else
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);

		SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, alloc_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
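		/*
		 * Drop our reference to the old version.  If that was the
		 * last one, reset its priority so it is subject to normal
		 * LRU aging and revoke access to its memory (a no-op unless
		 * DIAG_MVCC is enabled).
		 */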
		if (--bhp->ref == 0) {
			bhp->priority = c_mp->lru_count;
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
		}
		bhp = alloc_bhp;

		if (alloc_bhp != oldest_bhp) {
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);
		}

		alloc_bhp = NULL;
	} else if (mvcc && extending && txn != NULL &&
	    (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
		goto err;

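	/*
	 * If the caller is going to modify the page (or we just created it),
	 * mark the buffer dirty and count it in the bucket's dirty total.
	 */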
	if ((dirty || edit || extending) && !F_ISSET(bhp, BH_DIRTY)) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		++hp->hash_page_dirty;
		F_SET(bhp, BH_DIRTY);
	}

	/*
	 * If we're the only reference, update the buffer's priority.  We may
	 * be about to release the hash bucket lock, so everything should be
	 * correct first.  (We've already done this work if we created the
	 * buffer, so there is no need to do it again.)
	 */
	if (state != SECOND_MISS && bhp->ref == 1) {
		bhp->priority = UINT32_MAX;
		if (SH_CHAIN_SINGLETON(bhp, vc)) {
			if (bhp != SH_TAILQ_LAST(&hp->hash_bucket, hq, __bh)) {
				SH_TAILQ_REMOVE(&hp->hash_bucket,
				    bhp, hq, __bh);
				SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
			}
		}
	}

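	/*
	 * In DIAG_MVCC builds, adjust the page's memory protection: it is
	 * writable only if the buffer is (or is about to be made) dirty.
	 */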
	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
	    (dirty || edit || extending || F_ISSET(bhp, BH_DIRTY) ?
	    PROT_WRITE : 0));

#ifdef DIAGNOSTIC
	{
	BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);

	DB_ASSERT(env, !mfp->multiversion ||
	    !F_ISSET(bhp, BH_DIRTY) || next_bhp == NULL);

	DB_ASSERT(env, !mvcc || edit || read_lsnp == NULL ||
	    bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
	    (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
	    (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
	    (next_bhp->td_off != INVALID_ROFF &&
	    (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
	    !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
	}
#endif

	MUTEX_UNLOCK(env, hp->mtx_hash);
	/*
	 * Record this pin for this thread.  Holding the page pinned
	 * without recording the pin is ok since we do not recover from
	 * a death from within the library itself.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
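		/*
		 * If the thread's pin list is full, double it: allocate a
		 * larger array in the region, copy the old entries across,
		 * and free the old array unless it is the thread's initial
		 * built-in array (dbth_pinarray).
		 */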
		if (ip->dbth_pincount == ip->dbth_pinmax) {
			pinmax = ip->dbth_pinmax;
			if ((ret = __env_alloc(reginfo,
			    2 * pinmax * sizeof(PIN_LIST), &list)) != 0)
				goto err;

			memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
			    pinmax * sizeof(PIN_LIST));
			memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
			list_off = R_OFFSET(reginfo, list);
			list = R_ADDR(reginfo, ip->dbth_pinlist);
			ip->dbth_pinmax = 2 * pinmax;
			ip->dbth_pinlist = list_off;
			if (list != ip->dbth_pinarray)
				__env_alloc_free(reginfo, list);
		}
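		/* Find an empty slot in the pin list and record this pin. */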
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == INVALID_ROFF)
				break;

		ip->dbth_pincount++;
		lp->b_ref = R_OFFSET(infop, bhp);
		lp->region = (int)(infop - dbmp->reginfo);
	}

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	MPOOL_SYSTEM_LOCK(env);
	++dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
		__os_yield(env, 0, 0);
#endif

	DB_ASSERT(env, alloc_bhp == NULL);

	*(void **)addrp = bhp->buf;
	return (0);

err:	/*
	 * Discard our reference.  If we're the only reference, discard the
	 * buffer entirely.  If we held a reference to a buffer, we are
	 * also still holding the hash bucket mutex.
	 */
	if (b_incr || frozen_bhp != NULL) {
		if (!b_locked) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
		}
		if (frozen_bhp != NULL)
			--frozen_bhp->ref;
		if (b_incr && bhp != frozen_bhp)
			--bhp->ref;
	}
	if (b_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL) {
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
	}

	return (ret);
}