/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * __memp_fget_pp --
 *	DB_MPOOLFILE->get pre/post processing.
 *
 * PUBLIC: int __memp_fget_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_TXN *txnp;
	u_int32_t flags;
	void *addrp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int rep_blocked, ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
	    DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_DIRTY:
		case DB_MPOOL_CREATE:
		case DB_MPOOL_EDIT:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
			break;
		default:
			return (__db_ferr(env, "memp_fget", 1));
		}
	}

	ENV_ENTER(env, ip);

	rep_blocked = 0;
	if (txnp == NULL && IS_ENV_REPLICATED(env)) {
		if ((ret = __op_rep_enter(env)) != 0)
			goto err;
		rep_blocked = 1;
	}
	ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
	/*
	 * We only decrement the count in op_rep_exit if the operation fails.
	 * Otherwise the count will be decremented when the page is no longer
	 * pinned in memp_fput.
	 */
	if (ret != 0 && rep_blocked)
		(void)__op_rep_exit(env);

	/* Similarly, if an app has a page pinned, it is ACTIVE. */
err:	if (ret != 0)
		ENV_LEAVE(env, ip);

	return (ret);
}
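
/*
 * Example (application view) -- a minimal usage sketch only; the variable
 * names below are illustrative and not part of this file.  DB_MPOOLFILE->get
 * is backed by __memp_fget_pp above, and DB_MPOOLFILE->put releases the pin:
 *
 *	DB_MPOOLFILE *mpf;
 *	db_pgno_t pgno;
 *	void *addr;
 *	int ret;
 *
 *	pgno = 0;
 *	if ((ret = mpf->get(mpf, &pgno, NULL, 0, &addr)) == 0) {
 *		... examine the page through addr ...
 *		ret = mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);
 *	}
 */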

/*
 * __memp_fget --
 *	Get a page from the file.
 *
 * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
 * PUBLIC:     db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp, *oldest_bhp;
	ENV *env;
	DB_LSN *read_lsnp, vlsn;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGENV *renv;
	REGINFO *infop, *t_infop, *reginfo;
	TXN_DETAIL *td;
	roff_t list_off, mf_offset;
	u_int32_t bucket, pinmax, st_hsearch;
	int b_incr, b_lock, h_locked, dirty, extending;
	int makecopy, mvcc, need_free, ret;

	*(void **)addrp = NULL;
	COMPQUIET(c_mp, NULL);
	COMPQUIET(infop, NULL);

	env = dbmfp->env;
	dbmp = env->mp_handle;

	mfp = dbmfp->mfp;
	mvcc = mfp->multiversion && (txn != NULL);
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = oldest_bhp = NULL;
	read_lsnp = NULL;
	td = NULL;
	hp = NULL;
	b_incr = b_lock = h_locked = extending = makecopy = ret = 0;

	if (LF_ISSET(DB_MPOOL_DIRTY)) {
		if (F_ISSET(dbmfp, MP_READONLY)) {
			__db_errx(env,
			    "%s: dirty flag set for readonly file page",
			    __memp_fn(dbmfp));
			return (EINVAL);
		}
		if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
		    flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
			return (ret);
	}

	dirty = LF_ISSET(DB_MPOOL_DIRTY | DB_MPOOL_EDIT | DB_MPOOL_FREE);
	LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);

	/*
	 * If the transaction is being used to update a multiversion database
	 * for the first time, set the read LSN.  In addition, if this is an
	 * update, allocate a mutex.  If no transaction has been supplied, that
	 * will be caught later, when we know whether one is required.
	 */
	if (mvcc && txn != NULL && txn->td != NULL) {
		/* We're only interested in the ultimate parent transaction. */
		while (txn->parent != NULL)
			txn = txn->parent;
		td = (TXN_DETAIL *)txn->td;
		if (F_ISSET(txn, TXN_SNAPSHOT)) {
			read_lsnp = &td->read_lsn;
			if (IS_MAX_LSN(*read_lsnp) &&
			    (ret = __log_current_lsn(env, read_lsnp,
			    NULL, NULL)) != 0)
				return (ret);
		}
		if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
		    td->mvcc_mtx == MUTEX_INVALID && (ret =
		    __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
			return (ret);
	}

	switch (flags) {
	case DB_MPOOL_LAST:
		/* Get the last page number in the file. */
		MUTEX_LOCK(env, mfp->mutex);
		*pgnoaddr = mfp->last_pgno;
		MUTEX_UNLOCK(env, mfp->mutex);
		break;
	case DB_MPOOL_NEW:
		/*
		 * If always creating a page, skip the first search
		 * of the hash bucket.
		 */
		goto newpg;
	case DB_MPOOL_CREATE:
	default:
		break;
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.  We can't use R_ADDR here: this is an offset
	 * into an mmap'd file, not a shared region, and doesn't change for
	 * private environments.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp = (u_int8_t *)dbmfp->addr +
		    (*pgnoaddr * mfp->stat.st_pagesize);
		STAT(++mfp->stat.st_map);
		return (0);
	}

	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  These are reset on each pass through this
	 * code because the page number can change.
	 */
	MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, bucket, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

	if (0) {
		/* If we search again, get an exclusive lock. */
retry:		MUTEX_LOCK(env, hp->mtx_hash);
	}

	/* Search the hash chain for the page. */
	st_hsearch = 0;
	h_locked = 1;
	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Snapshot reads -- get the version visible at read_lsn. */
		if (read_lsnp != NULL) {
			while (bhp != NULL &&
			    !BH_OWNED_BY(env, bhp, txn) &&
			    !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
				bhp = SH_CHAIN_PREV(bhp, vc, __bh);

			/*
			 * We can get a null bhp if we are looking for a
			 * page that was created after the transaction
			 * started, so it's not visible (i.e. the page was
			 * added to the BTREE in a subsequent txn).
			 */
			if (bhp == NULL) {
				ret = DB_PAGE_NOTFOUND;
				goto err;
			}
		}

		makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);

		/*
		 * Increment the reference count.  This signals that the
		 * buffer may not be discarded.  We must drop the hash
		 * mutex before we lock the buffer mutex.
		 */
		if (BH_REFCOUNT(bhp) == UINT16_MAX) {
			__db_errx(env,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = __env_panic(env, EINVAL);
			goto err;
		}
		atomic_inc(env, &bhp->ref);
		b_incr = 1;

		/*
		 * Lock the buffer.  If the page is being read in or modified,
		 * it will be exclusively locked and we will block.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
		if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
xlatch:			if (LF_ISSET(DB_MPOOL_TRY)) {
				if ((ret =
				    MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
					goto err;
			} else
				MUTEX_LOCK(env, bhp->mtx_buf);
			F_SET(bhp, BH_EXCLUSIVE);
		} else if (LF_ISSET(DB_MPOOL_TRY)) {
			if ((ret = MUTEX_TRY_READLOCK(env, bhp->mtx_buf)) != 0)
				goto err;
		} else
			MUTEX_READLOCK(env, bhp->mtx_buf);

#ifdef HAVE_SHARED_LATCHES
		/*
		 * If the buffer is still in transit once we have a shared
		 * latch, upgrade to an exclusive latch.
		 */
		if (F_ISSET(bhp, BH_FREED | BH_TRASH) &&
		    !F_ISSET(bhp, BH_EXCLUSIVE)) {
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			goto xlatch;
		}
#else
		F_SET(bhp, BH_EXCLUSIVE);
#endif
		b_lock = 1;

		/*
		 * If the buffer was frozen before we waited for any I/O to
		 * complete and is still frozen, we will need to thaw it.
		 * Otherwise, it was thawed while we waited, and we need to
		 * search again.
		 */
		if (F_ISSET(bhp, BH_THAWED)) {
thawed:			need_free = (atomic_dec(env, &bhp->ref) == 0);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			if (need_free) {
				MPOOL_REGION_LOCK(env, infop);
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    bhp, hq);
				MPOOL_REGION_UNLOCK(env, infop);
			}
			bhp = NULL;
			goto retry;
		}

		/*
		 * If the buffer we wanted was frozen or thawed while we
		 * waited, we need to start again.  That is indicated by
		 * a new buffer header in the version chain owned by the same
		 * transaction as the one we pinned.
		 *
		 * Also, if we're doing an unversioned read on a multiversion
		 * file, another thread may have dirtied this buffer while we
		 * swapped from the hash bucket lock to the buffer lock.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) &&
		    (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
		    (!dirty && read_lsnp == NULL))) {
			DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto retry;
		} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
			ret = DB_LOCK_DEADLOCK;
			goto err;
		} else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE &&
		    flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
			ret = DB_PAGE_NOTFOUND;
			goto err;
		}

		STAT(++mfp->stat.st_cache_hit);
		break;
	}

#ifdef HAVE_STATISTICS
	/*
	 * Update the hash bucket search statistics -- do this now because our
	 * next search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;
#endif

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);

	switch (state) {
	case FIRST_FOUND:
		/*
		 * If we are to free the buffer, then this had better be the
		 * only reference. If so, just free the buffer.  If not,
		 * complain and get out.
		 */
		if (flags == DB_MPOOL_FREE) {
freebuf:		MUTEX_LOCK(env, hp->mtx_hash);
			h_locked = 1;
			if (F_ISSET(bhp, BH_DIRTY)) {
				F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
				DB_ASSERT(env,
				   atomic_read(&hp->hash_page_dirty) > 0);
				atomic_dec(env, &hp->hash_page_dirty);
			}

			/*
			 * If the buffer we found is already freed, we're done.
			 * If the ref count is not 1 then someone may be
			 * peeking at the buffer.  We cannot free it until they
			 * determine that it is not what they want.  Clear the
			 * buffer so that waiting threads get an empty page.
			 */
			if (F_ISSET(bhp, BH_FREED))
				goto done;
			else if (F_ISSET(bhp, BH_FROZEN))
				makecopy = 1;

			if (makecopy)
				break;
			else if (BH_REFCOUNT(bhp) != 1 ||
			    !SH_CHAIN_SINGLETON(bhp, vc)) {
				/*
				 * Create an empty page in the chain for
				 * subsequent gets.  Otherwise, a thread that
				 * re-creates this page while it is still in
				 * cache will see stale data.
				 */
				F_SET(bhp, BH_FREED);
				F_CLR(bhp, BH_TRASH);
			} else {
				ret = __memp_bhfree(dbmp, infop, mfp,
				    hp, bhp, BH_FREE_FREEMEM);
				bhp = NULL;
				b_incr = b_lock = h_locked = 0;
			}
			goto done;
		} else if (F_ISSET(bhp, BH_FREED)) {
revive:			DB_ASSERT(env,
			    flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW);
			makecopy = makecopy ||
			    (mvcc && !BH_OWNED_BY(env, bhp, txn)) ||
			    F_ISSET(bhp, BH_FROZEN);
			if (flags == DB_MPOOL_CREATE) {
				MUTEX_LOCK(env, mfp->mutex);
				if (*pgnoaddr > mfp->last_pgno)
					mfp->last_pgno = *pgnoaddr;
				MUTEX_UNLOCK(env, mfp->mutex);
			}
		}
		if (mvcc) {
			/*
			 * With multiversion databases, we might need to
			 * allocate a new buffer into which we can copy the one
			 * that we found.  In that case, check the last buffer
			 * in the chain to see whether we can reuse an obsolete
			 * buffer.
			 *
			 * To provide snapshot isolation, we need to make sure
			 * that we've seen a buffer older than the oldest
			 * snapshot read LSN.
			 */
reuse:			if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
			    !h_locked) {
				MUTEX_LOCK(env, hp->mtx_hash);
				h_locked = 1;
			}
			if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
			    SH_CHAIN_HASPREV(bhp, vc)) {
				oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
				while (SH_CHAIN_HASPREV(oldest_bhp, vc))
					oldest_bhp = SH_CHAIN_PREVP(
					    oldest_bhp, vc, __bh);

				if (BH_REFCOUNT(oldest_bhp) == 0 &&
				    !BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    (ret = __txn_oldest_reader(env,
				    &hp->old_reader)) != 0)
					goto err;

				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    BH_REFCOUNT(oldest_bhp) == 0) {
					DB_ASSERT(env,
					    !F_ISSET(oldest_bhp, BH_DIRTY));
					atomic_inc(env, &oldest_bhp->ref);
					if (F_ISSET(oldest_bhp, BH_FROZEN)) {
						/*
						 * This call will release the
						 * hash bucket mutex.
						 */
						ret = __memp_bh_thaw(dbmp,
						    infop, hp, oldest_bhp,
						    NULL);
						h_locked = 0;
						if (ret != 0)
							goto err;
						goto reuse;
					}
					if ((ret = __memp_bhfree(dbmp,
					    infop, mfp, hp, oldest_bhp,
					    BH_FREE_REUSE)) != 0)
						goto err;
					alloc_bhp = oldest_bhp;
					h_locked = 0;
				}

				DB_ASSERT(env, alloc_bhp == NULL ||
				    !F_ISSET(alloc_bhp, BH_FROZEN));
			}
		}

		/* We found the buffer or we're ready to copy -- we're done. */
		if (!(makecopy || F_ISSET(bhp, BH_FROZEN)) || alloc_bhp != NULL)
			break;

		/* FALLTHROUGH */
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		if (h_locked)
			MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/*
		 * The buffer is not in the pool, so we don't need to free it.
		 */
		if (LF_ISSET(DB_MPOOL_FREE) &&
		    (bhp == NULL || F_ISSET(bhp, BH_FREED) || !makecopy))
			goto done;

		if (bhp != NULL)
			goto alloc;

newpg:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
		 * it's an error to try to get a page past the end of the file.
		 */
		DB_ASSERT(env, !h_locked);
		MUTEX_LOCK(env, mfp->mutex);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			if (mfp->maxpgno != 0 &&
			    mfp->last_pgno >= mfp->maxpgno) {
				__db_errx(env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else
				*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
				__db_errx(env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else if (!extending)
				extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
			break;
		}
		MUTEX_UNLOCK(env, mfp->mutex);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, hp, infop and c_mp have
		 * not yet been initialized.
		 */
		if (hp == NULL) {
			MP_GET_BUCKET(env,
			    mfp, *pgnoaddr, &infop, hp, bucket, ret);
			if (ret != 0)
				goto err;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			c_mp = infop->primary;
		}

alloc:		/* Allocate a new buffer header and data space. */
		if (alloc_bhp == NULL && (ret =
		    __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;

		/* Initialize enough so we can call __memp_bhfree. */
		alloc_bhp->flags = 0;
		atomic_init(&alloc_bhp->ref, 1);
#ifdef DIAGNOSTIC
		if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_errx(env,
		    "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
			ret = __env_panic(env, EINVAL);
			goto err;
		}
#endif

		/*
		 * If we're doing copy-on-write, we will already have the
		 * buffer header.  In that case, we don't need to search again.
		 */
		if (bhp != NULL)
			break;

		/*
		 * If we are extending the file, we'll need the mfp lock
		 * again.
		 */
		if (extending)
			MUTEX_LOCK(env, mfp->mutex);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is interesting
		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
		 * did not specify the page number, and so may reasonably not
		 * have any way to lock the page outside of mpool.) Regardless,
		 * if we allocate the page, and some other thread of control
		 * requests the page by number, we will not detect that and the
		 * thread of control that allocated using DB_MPOOL_NEW may not
		 * have a chance to initialize the page.  (Note: we *could*
		 * detect this case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until the
		 * reference count went to 0, that is, until the creating
		 * thread put the page.)  What we do guarantee is that if two
		 * threads of control are both doing DB_MPOOL_NEW calls, they
		 * won't collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the page
		 * we were planning to allocate while we were off doing buffer
		 * allocation.  We check for that by making sure the page
		 * number we were going to use is still available.  If it's
		 * not, then we check to see if the next available page number
		 * hashes to the same mpool region as the old one -- if it
		 * does, we can continue; otherwise, we have to start over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
			if (ret != 0)
				goto err;
			if (t_infop != infop) {
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the mfp locked.
				 */
				MUTEX_UNLOCK(env, mfp->mutex);
				goto newpg;
			}
		}

		/*
		 * We released the mfp lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
			if (*pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;
			MUTEX_UNLOCK(env, mfp->mutex);
			if (ret != 0)
				goto err;
		}
		goto retry;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.  If we're about to copy-on-write,
		 * this is exactly the situation we want.
		 *
		 * For multiversion files, we may have left some pages in cache
		 * beyond the end of a file after truncating.  In that case, we
		 * would get to here with extending set.  If so, we need to
		 * insert the new page in the version chain similar to when
		 * we copy on write.
		 */
		if (F_ISSET(bhp, BH_FREED) &&
		    (flags == DB_MPOOL_NEW || flags == DB_MPOOL_CREATE))
			goto revive;
		else if (flags == DB_MPOOL_FREE)
			goto freebuf;
		else if (makecopy || F_ISSET(bhp, BH_FROZEN))
			break;

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
		 * release our pin on this particular buffer, and try to get
		 * another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			if (F_ISSET(bhp, BH_EXCLUSIVE))
				F_CLR(bhp, BH_EXCLUSIVE);
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto newpg;
		}

		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and found
		 * the page still missing on our second pass through the buffer
		 * cache.  Instantiate the page.
		 */
		DB_ASSERT(env, alloc_bhp != NULL);
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can call
		 * __memp_bhfree if an error occurs.
		 *
		 * Insert the buffer at the head of the bucket list.
		 */
		bhp->priority = UINT32_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		bhp->bucket = bucket;
		bhp->region = (int)(infop - dbmp->reginfo);
		bhp->td_off = INVALID_ROFF;
		SH_CHAIN_INIT(bhp, vc);
		bhp->flags = 0;

		/*
		 * Reference the buffer and lock it exclusively.  We either
		 * need to read the buffer or create it from scratch
		 * and don't want anyone looking at it till we do.
		 */
		MUTEX_LOCK(env, bhp->mtx_buf);
		b_lock = 1;
		F_SET(bhp, BH_EXCLUSIVE);
		b_incr = 1;
		/* We created a new page; it starts dirty. */
		if (extending) {
			atomic_inc(env, &hp->hash_page_dirty);
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		MUTEX_REQUIRED(env, hp->mtx_hash);
		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, bhp, hq, __bh);
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on the
		 * page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating it
		 * if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE);
			memset(bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
			if (mfp->clear_len != DB_CLEARLEN_NOTSET)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0 &&
			    (ret = __memp_pg(dbmfp,
			    bhp->pgno, bhp->buf, 1)) != 0)
				goto err;

			STAT(++mfp->stat.st_page_create);
		} else {
			F_SET(bhp, BH_TRASH);
			STAT(++mfp->stat.st_cache_miss);
		}

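		/*
		 * Even though we just instantiated this page, an MVCC update
		 * of a page read from disk still needs its own copy later;
		 * a page we created while extending the file does not.
		 */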
		makecopy = mvcc && dirty && !extending;

		/* Increment buffer count referenced by MPOOLFILE. */
		MUTEX_LOCK(env, mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(env, mfp->mutex);
	}

	DB_ASSERT(env, bhp != NULL && BH_REFCOUNT(bhp) != 0 && b_lock);
	DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN) || !F_ISSET(bhp, BH_FREED) ||
	    makecopy);

	/* We've got a buffer header we're re-instantiating. */
	if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
		if (alloc_bhp == NULL)
			goto reuse;

		/*
		 * To thaw the buffer, we must hold the hash bucket mutex,
		 * and the call to __memp_bh_thaw will release it.
		 */
		if (h_locked == 0)
			MUTEX_LOCK(env, hp->mtx_hash);
		h_locked = 1;

		/*
		 * If the empty buffer has been filled in the meantime, don't
		 * overwrite it.
		 */
		if (F_ISSET(bhp, BH_THAWED)) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			h_locked = 0;
			goto thawed;
		}

		ret = __memp_bh_thaw(dbmp, infop, hp, bhp, alloc_bhp);
		bhp = NULL;
		b_lock = h_locked = 0;
		if (ret != 0)
			goto err;
		bhp = alloc_bhp;
		alloc_bhp = NULL;
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		b_incr = b_lock = 1;
	}

	/*
	 * BH_TRASH --
	 * The buffer we found may need to be filled from the disk.
	 *
	 * It's possible for the read function to fail, which means we fail
	 * as well.  Discard the buffer on failure unless another thread
	 * is waiting on our I/O to complete.  It's OK to leave the buffer
	 * around, as the waiting thread will see the BH_TRASH flag set,
	 * and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    flags != DB_MPOOL_FREE && !F_ISSET(bhp, BH_FREED)) {
		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
			goto err;
		DB_ASSERT(env, read_lsnp != NULL || !SH_CHAIN_HASNEXT(bhp, vc));
	}

	/* Copy-on-write. */
	if (makecopy) {
		/*
		 * If we read a page from disk that we want to modify, we now
		 * need to make a copy, so we need to allocate another buffer
		 * to hold the new copy.
		 */
		if (alloc_bhp == NULL)
			goto reuse;

		DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
		DB_ASSERT(env, txn != NULL ||
		    (F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED)));
		DB_ASSERT(env, (extending || flags == DB_MPOOL_FREE ||
		    F_ISSET(bhp, BH_FREED)) ||
		    !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
		MUTEX_REQUIRED(env, bhp->mtx_buf);

		if (BH_REFCOUNT(bhp) == 1)
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ);

		atomic_init(&alloc_bhp->ref, 1);
		MUTEX_LOCK(env, alloc_bhp->mtx_buf);
		alloc_bhp->priority = bhp->priority;
		alloc_bhp->pgno = bhp->pgno;
		alloc_bhp->bucket = bhp->bucket;
		alloc_bhp->region = bhp->region;
		alloc_bhp->mf_offset = bhp->mf_offset;
		alloc_bhp->td_off = INVALID_ROFF;
		if (txn == NULL) {
			DB_ASSERT(env,
			    F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED));
			if (bhp->td_off != INVALID_ROFF && (ret =
			    __memp_bh_settxn(dbmp, mfp, alloc_bhp,
			    BH_OWNER(env, bhp))) != 0)
				goto err;
		} else if ((ret =
		    __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
			goto err;
		MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize,
		    PROT_READ | PROT_WRITE);
		if (extending ||
		    F_ISSET(bhp, BH_FREED) || flags == DB_MPOOL_FREE) {
			memset(alloc_bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
			if (mfp->clear_len != DB_CLEARLEN_NOTSET)
				memset(alloc_bhp->buf + mfp->clear_len,
				    CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		} else
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);
		MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize, 0);

		if (h_locked == 0)
			MUTEX_LOCK(env, hp->mtx_hash);
		MUTEX_REQUIRED(env, hp->mtx_hash);
		h_locked = 1;

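		/*
		 * Link the new version into the version chain after bhp and
		 * swap it into the hash bucket in bhp's place.
		 */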
		alloc_bhp->flags = BH_EXCLUSIVE |
		    ((flags == DB_MPOOL_FREE) ? BH_FREED :
		    F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE));
		DB_ASSERT(env, flags != DB_MPOOL_FREE ||
		    !F_ISSET(bhp, BH_DIRTY));
		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, alloc_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
		DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0);
		if (atomic_dec(env, &bhp->ref) == 0) {
			bhp->priority = c_mp->lru_count;
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
		}
		F_CLR(bhp, BH_EXCLUSIVE);
		MUTEX_UNLOCK(env, bhp->mtx_buf);

		bhp = alloc_bhp;
		DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
		b_incr = 1;
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		b_lock = 1;

		if (alloc_bhp != oldest_bhp) {
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);
		}

		alloc_bhp = NULL;
	} else if (mvcc && extending &&
	    (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
		goto err;

	if (flags == DB_MPOOL_FREE) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		/* If we have created an empty buffer, it is not returned. */
		if (!F_ISSET(bhp, BH_FREED))
			goto freebuf;
		goto done;
	}

	/*
	 * Free the allocated memory; we no longer need it.
	 */
	if (alloc_bhp != NULL) {
		if ((ret = __memp_bhfree(dbmp, infop, NULL,
		     NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0)
			goto err;
		alloc_bhp = NULL;
	}

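	/*
	 * If the caller will modify the page, or we created or revived it,
	 * clear any freed state and mark the buffer dirty now.
	 */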
	if (dirty || extending ||
	    (F_ISSET(bhp, BH_FREED) &&
	    (flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW))) {
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		if (F_ISSET(bhp, BH_FREED)) {
			memset(bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
			F_CLR(bhp, BH_FREED);
		}
		if (!F_ISSET(bhp, BH_DIRTY)) {
#ifdef DIAGNOSTIC
			MUTEX_LOCK(env, hp->mtx_hash);
#endif
			DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
			atomic_inc(env, &hp->hash_page_dirty);
			F_SET(bhp, BH_DIRTY);
#ifdef DIAGNOSTIC
			MUTEX_UNLOCK(env, hp->mtx_hash);
#endif
		}
	} else if (F_ISSET(bhp, BH_EXCLUSIVE)) {
		F_CLR(bhp, BH_EXCLUSIVE);
#ifdef HAVE_SHARED_LATCHES
		MUTEX_UNLOCK(env, bhp->mtx_buf);
		MUTEX_READLOCK(env, bhp->mtx_buf);
		/*
		 * If another thread has dirtied the page while we
		 * switched locks, we have to go through it all again.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) && read_lsnp == NULL) {
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto retry;
		}
#endif
	}

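	/*
	 * Request read access to the page, and write access only when this
	 * access may modify it or the buffer is already dirty.
	 */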
	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
	    (dirty || extending || F_ISSET(bhp, BH_DIRTY) ?
	    PROT_WRITE : 0));

#ifdef DIAGNOSTIC
	MUTEX_LOCK(env, hp->mtx_hash);
	{
	BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);

	DB_ASSERT(env, !mfp->multiversion || read_lsnp != NULL ||
	    next_bhp == NULL);
	DB_ASSERT(env, !mvcc || read_lsnp == NULL ||
	    bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
	    (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
	    (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
	    (next_bhp->td_off != INVALID_ROFF &&
	    (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
	    IS_ZERO_LSN(BH_OWNER(env, next_bhp)->last_lsn) ||
	    !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
	}
	MUTEX_UNLOCK(env, hp->mtx_hash);
#endif

	/*
	 * Record this pin for this thread.  Holding the page pinned without
	 * recording the pin is OK, since we do not recover from a death
	 * within the library itself.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
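		/*
		 * If the pin list is full, double it: allocate a new list in
		 * the environment region, copy the old entries over, then
		 * free the old list unless it is the thread's inline array.
		 */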
		if (ip->dbth_pincount == ip->dbth_pinmax) {
			pinmax = ip->dbth_pinmax;
			renv = reginfo->primary;
			MUTEX_LOCK(env, renv->mtx_regenv);
			if ((ret = __env_alloc(reginfo,
			    2 * pinmax * sizeof(PIN_LIST), &list)) != 0) {
				MUTEX_UNLOCK(env, renv->mtx_regenv);
				goto err;
			}

			memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
			    pinmax * sizeof(PIN_LIST));
			memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
			list_off = R_OFFSET(reginfo, list);
			list = R_ADDR(reginfo, ip->dbth_pinlist);
			ip->dbth_pinmax = 2 * pinmax;
			ip->dbth_pinlist = list_off;
			if (list != ip->dbth_pinarray)
				__env_alloc_free(reginfo, list);
			MUTEX_UNLOCK(env, renv->mtx_regenv);
		}
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == INVALID_ROFF)
				break;

		ip->dbth_pincount++;
		lp->b_ref = R_OFFSET(infop, bhp);
		lp->region = (int)(infop - dbmp->reginfo);
	}

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	MPOOL_SYSTEM_LOCK(env);
	++dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
		__os_yield(env, 0, 0);
#endif

	DB_ASSERT(env, alloc_bhp == NULL);
	DB_ASSERT(env, !(dirty || extending) ||
	    atomic_read(&hp->hash_page_dirty) > 0);
	DB_ASSERT(env, BH_REFCOUNT(bhp) > 0 &&
	    !F_ISSET(bhp, BH_FREED | BH_FROZEN | BH_TRASH));

	*(void **)addrp = bhp->buf;
	return (0);

done:
err:	/*
	 * We should only get to here with ret == 0 if freeing a buffer.
	 * In that case, check that it has in fact been freed.
	 */
	DB_ASSERT(env, ret != 0 || flags != DB_MPOOL_FREE || bhp == NULL ||
	    (F_ISSET(bhp, BH_FREED) && !SH_CHAIN_HASNEXT(bhp, vc)));

	if (bhp != NULL) {
		if (b_incr)
			atomic_dec(env, &bhp->ref);
		if (b_lock) {
			F_CLR(bhp, BH_EXCLUSIVE);
			MUTEX_UNLOCK(env, bhp->mtx_buf);
		}
	}

	if (h_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL)
		(void)__memp_bhfree(dbmp, infop, NULL,
		     NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED);

	return (ret);
}