1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: mp_bh.c,v 12.43 2008/01/08 20:58:42 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"		/* Required for diagnostic code. */
13#include "dbinc/mp.h"
14#include "dbinc/log.h"
15#include "dbinc/txn.h"
16
17static int __memp_pgwrite
18	       __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
19
20/*
21 * __memp_bhwrite --
22 *	Write the page associated with a given buffer header.
23 *
24 * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
25 * PUBLIC:      DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
26 */
27int
28__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
29	DB_MPOOL *dbmp;
30	DB_MPOOL_HASH *hp;
31	MPOOLFILE *mfp;
32	BH *bhp;
33	int open_extents;
34{
35	DB_MPOOLFILE *dbmfp;
36	DB_MPREG *mpreg;
37	ENV *env;
38	int ret;
39
40	env = dbmp->env;
41
42	/*
43	 * If the file has been removed or is a closed temporary file, we're
44	 * done -- the page-write function knows how to handle the fact that
45	 * we don't have (or need!) any real file descriptor information.
46	 */
47	if (mfp->deadfile)
48		return (__memp_pgwrite(env, NULL, hp, bhp));
49
50	/*
51	 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
52	 * the file.  We also check that the descriptor is open for writing.
53	 */
54	MUTEX_LOCK(env, dbmp->mutex);
55	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
56		if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) {
57			++dbmfp->ref;
58			break;
59		}
60	MUTEX_UNLOCK(env, dbmp->mutex);
61
62	if (dbmfp != NULL) {
63		/*
64		 * Temporary files may not have been created.  We only handle
65		 * temporary files in this path, because only the process that
66		 * created a temporary file will ever flush buffers to it.
67		 */
68		if (dbmfp->fhp == NULL) {
69			/* We may not be allowed to create backing files. */
70			if (mfp->no_backing_file) {
71				--dbmfp->ref;
72				return (EPERM);
73			}
74
75			MUTEX_LOCK(env, dbmp->mutex);
76			if (dbmfp->fhp == NULL)
77				ret = __db_appname(env, DB_APP_TMP, NULL,
78				    F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ?
79				    DB_OSO_DIRECT : 0, &dbmfp->fhp, NULL);
80			else
81				ret = 0;
82			MUTEX_UNLOCK(env, dbmp->mutex);
83			if (ret != 0) {
84				__db_errx(env,
85				    "unable to create temporary backing file");
86				--dbmfp->ref;
87				return (ret);
88			}
89		}
90
91		goto pgwrite;
92	}
93
94	/*
95	 * There's no file handle for this file in our process.
96	 *
97	 * !!!
98	 * It's the caller's choice if we're going to open extent files.
99	 */
100	if (!open_extents && F_ISSET(mfp, MP_EXTENT))
101		return (EPERM);
102
103	/*
104	 * !!!
105	 * Don't try to attach to temporary files.  There are two problems in
106	 * trying to do that.  First, if we have different privileges than the
107	 * process that "owns" the temporary file, we might create the backing
108	 * disk file such that the owning process couldn't read/write its own
109	 * buffers, e.g., memp_trickle running as root creating a file owned
110	 * as root, mode 600.  Second, if the temporary file has already been
111	 * created, we don't have any way of finding out what its real name is,
112	 * and, even if we did, it was already unlinked (so that it won't be
113	 * left if the process dies horribly).  This decision causes a problem,
114	 * however: if the temporary file consumes the entire buffer cache,
115	 * and the owner doesn't flush the buffers to disk, we could end up
116	 * with resource starvation, and the memp_trickle thread couldn't do
117	 * anything about it.  That's a pretty unlikely scenario, though.
118	 *
119	 * Note we should never get here when the temporary file in question
120	 * has already been closed in another process, in which case it should
121	 * be marked dead.
122	 */
123	if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
124		return (EPERM);
125
126	/*
127	 * It's not a page from a file we've opened.  If the file requires
128	 * application-specific input/output processing, see if this process
129	 * has ever registered information as to how to write this type of
130	 * file.  If not, there's nothing we can do.
131	 */
132	if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) {
133		MUTEX_LOCK(env, dbmp->mutex);
134		LIST_FOREACH(mpreg, &dbmp->dbregq, q)
135			if (mpreg->ftype == mfp->ftype)
136				break;
137		MUTEX_UNLOCK(env, dbmp->mutex);
138		if (mpreg == NULL)
139			return (EPERM);
140	}
141
142	/*
143	 * Try and open the file, specifying the known underlying shared area.
144	 *
145	 * !!!
146	 * There's no negative cache, so we may repeatedly try and open files
147	 * that we have previously tried (and failed) to open.
148	 */
149	if ((ret = __memp_fcreate(env, &dbmfp)) != 0)
150		return (ret);
151	if ((ret = __memp_fopen(dbmfp,
152	    mfp, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) {
153		(void)__memp_fclose(dbmfp, 0);
154
155		/*
156		 * Ignore any error if the file is marked dead, assume the file
157		 * was removed from under us.
158		 */
159		if (!mfp->deadfile)
160			return (ret);
161
162		dbmfp = NULL;
163	}
164
165pgwrite:
166	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
167	    PROT_READ | PROT_WRITE | PROT_EXEC);
168	ret = __memp_pgwrite(env, dbmfp, hp, bhp);
169	if (dbmfp == NULL)
170		return (ret);
171
172	/*
173	 * Discard our reference, and, if we're the last reference, make sure
174	 * the file eventually gets closed.
175	 */
176	MUTEX_LOCK(env, dbmp->mutex);
177	if (dbmfp->ref == 1)
178		F_SET(dbmfp, MP_FLUSH);
179	else
180		--dbmfp->ref;
181	MUTEX_UNLOCK(env, dbmp->mutex);
182
183	return (ret);
184}
185
186/*
187 * __memp_pgread --
188 *	Read a page from a file.
189 *
190 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *, int));
191 */
192int
193__memp_pgread(dbmfp, hp, bhp, can_create)
194	DB_MPOOLFILE *dbmfp;
195	DB_MPOOL_HASH *hp;
196	BH *bhp;
197	int can_create;
198{
199	ENV *env;
200	MPOOLFILE *mfp;
201	size_t len, nr;
202	u_int32_t pagesize;
203	int ret;
204
205	env = dbmfp->env;
206	mfp = dbmfp->mfp;
207	pagesize = mfp->stat.st_pagesize;
208
209	/* We should never be called with a dirty or a locked buffer. */
210	DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_LOCKED));
211	DB_ASSERT(env, can_create || !F_ISSET(bhp, BH_DIRTY));
212
213	/* Lock the buffer and unlock the hash bucket. */
214	F_SET(bhp, BH_LOCKED | BH_TRASH);
215	MUTEX_UNLOCK(env, hp->mtx_hash);
216
217	/*
218	 * Temporary files may not yet have been created.  We don't create
219	 * them now, we create them when the pages have to be flushed.
220	 */
221	nr = 0;
222	if (dbmfp->fhp != NULL)
223		if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp,
224		    bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0)
225			goto err;
226
227	/*
228	 * The page may not exist; if it doesn't, nr may well be 0, but we
229	 * expect the underlying OS calls not to return an error code in
230	 * this case.
231	 */
232	if (nr < pagesize) {
233		/*
234		 * Don't output error messages for short reads.  In particular,
235		 * DB recovery processing may request pages never written to
236		 * disk or for which only some part have been written to disk,
237		 * in which case we won't find the page.  The caller must know
238		 * how to handle the error.
239		 */
240		if (!can_create) {
241			ret = DB_PAGE_NOTFOUND;
242			goto err;
243		}
244
245		/* Clear any bytes that need to be cleared. */
246		len = mfp->clear_len == DB_CLEARLEN_NOTSET ?
247		    pagesize : mfp->clear_len;
248		memset(bhp->buf, 0, len);
249
250#if defined(DIAGNOSTIC) || defined(UMRW)
251		/*
252		 * If we're running in diagnostic mode, corrupt any bytes on
253		 * the page that are unknown quantities for the caller.
254		 */
255		if (len < pagesize)
256			memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
257#endif
258#ifdef HAVE_STATISTICS
259		++mfp->stat.st_page_create;
260	} else
261		++mfp->stat.st_page_in;
262#else
263	}
264#endif
265
266	/* Call any pgin function. */
267	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
268
269	/* Re-acquire the hash bucket lock. */
270err:	MUTEX_LOCK(env, hp->mtx_hash);
271
272	/*
273	 * If no errors occurred, the data is now valid, clear the BH_TRASH
274	 * flag; regardless, clear the lock bit and let other threads proceed.
275	 */
276	F_CLR(bhp, BH_LOCKED);
277	if (ret == 0)
278		F_CLR(bhp, BH_TRASH);
279
280	/*
281	 * If a thread of control is waiting on this buffer, wake it up.
282	 */
283	if (F_ISSET(hp, IO_WAITER)) {
284		F_CLR(hp, IO_WAITER);
285		MUTEX_UNLOCK(env, hp->mtx_io);
286	}
287
288	return (ret);
289}
290
291/*
292 * __memp_pgwrite --
293 *	Write a page to a file.
294 */
295static int
296__memp_pgwrite(env, dbmfp, hp, bhp)
297	ENV *env;
298	DB_MPOOLFILE *dbmfp;
299	DB_MPOOL_HASH *hp;
300	BH *bhp;
301{
302	DB_LSN lsn;
303	MPOOLFILE *mfp;
304	size_t nw;
305	int callpgin, ret;
306
307	mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
308	callpgin = ret = 0;
309
310	/* We should never be called with a clean or trash buffer. */
311	DB_ASSERT(env, F_ISSET(bhp, BH_DIRTY));
312	DB_ASSERT(env, !F_ISSET(bhp, BH_TRASH));
313
314	/*
315	 * The sync code has already locked the buffer, but the allocation
316	 * code has not.  Lock the buffer and release the hash bucket mutex.
317	 */
318	F_SET(bhp, BH_LOCKED);
319	MUTEX_UNLOCK(env, hp->mtx_hash);
320
321	/*
322	 * It's possible that the underlying file doesn't exist, either
323	 * because of an outright removal or because it was a temporary
324	 * file that's been closed.
325	 *
326	 * !!!
327	 * Once we pass this point, we know that dbmfp and mfp aren't NULL,
328	 * and that we have a valid file reference.
329	 */
330	if (mfp == NULL || mfp->deadfile)
331		goto file_dead;
332
333	/*
334	 * If the page is in a file for which we have LSN information, we have
335	 * to ensure the appropriate log records are on disk.
336	 */
337	if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
338	    !IS_CLIENT_PGRECOVER(env)) {
339		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
340		if (!IS_NOT_LOGGED_LSN(lsn) &&
341		    (ret = __log_flush(env, &lsn)) != 0)
342			goto err;
343	}
344
345#ifdef DIAGNOSTIC
346	/*
347	 * Verify write-ahead logging semantics.
348	 *
349	 * !!!
350	 * Two special cases.  There is a single field on the meta-data page,
351	 * the last-page-number-in-the-file field, for which we do not log
352	 * changes.  If the page was originally created in a database that
353	 * didn't have logging turned on, we can see a page marked dirty but
354	 * for which no corresponding log record has been written.  However,
355	 * the only way that a page can be created for which there isn't a
356	 * previous log record and valid LSN is when the page was created
357	 * without logging turned on, and so we check for that special-case
358	 * LSN value.
359	 *
360	 * Second, when a client is reading database pages from a master
361	 * during an internal backup, we may get pages modified after
362	 * the current end-of-log.
363	 */
364	if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
365	    !IS_CLIENT_PGRECOVER(env)) {
366		/*
367		 * There is a potential race here.  If we are in the midst of
368		 * switching log files, it's possible we could test against the
369		 * old file and the new offset in the log region's LSN.  If we
370		 * fail the first test, acquire the log mutex and check again.
371		 */
372		DB_LOG *dblp;
373		LOG *lp;
374
375		dblp = env->lg_handle;
376		lp = dblp->reginfo.primary;
377		if (!lp->db_log_inmemory &&
378		    LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
379			MUTEX_LOCK(env, lp->mtx_flush);
380			DB_ASSERT(env,
381			    LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0);
382			MUTEX_UNLOCK(env, lp->mtx_flush);
383		}
384	}
385#endif
386
387	/*
388	 * Call any pgout function.  We set the callpgin flag so that we flag
389	 * that the contents of the buffer will need to be passed through pgin
390	 * before they are reused.
391	 */
392	if (mfp->ftype != 0 && !F_ISSET(bhp, BH_CALLPGIN)) {
393		callpgin = 1;
394		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
395			goto err;
396	}
397
398	/* Write the page. */
399	if ((ret = __os_io(
400	    env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno, mfp->stat.st_pagesize,
401	    0, mfp->stat.st_pagesize, bhp->buf, &nw)) != 0) {
402		__db_errx(env, "%s: write failed for page %lu",
403		    __memp_fn(dbmfp), (u_long)bhp->pgno);
404		goto err;
405	}
406	STAT(++mfp->stat.st_page_out);
407	if (bhp->pgno > mfp->last_flushed_pgno) {
408		MUTEX_LOCK(env, mfp->mutex);
409		if (bhp->pgno > mfp->last_flushed_pgno)
410			mfp->last_flushed_pgno = bhp->pgno;
411		MUTEX_UNLOCK(env, mfp->mutex);
412	}
413
414err:
415file_dead:
416	/*
417	 * !!!
418	 * Once we pass this point, dbmfp and mfp may be NULL, we may not have
419	 * a valid file reference.
420	 *
421	 * Re-acquire the hash lock.
422	 */
423	MUTEX_LOCK(env, hp->mtx_hash);
424
425	/*
426	 * If we rewrote the page, it will need processing by the pgin
427	 * routine before reuse.
428	 */
429	if (callpgin)
430		F_SET(bhp, BH_CALLPGIN);
431
432	/*
433	 * Update the hash bucket statistics, reset the flags.  If we were
434	 * successful, the page is no longer dirty.
435	 */
436	if (ret == 0) {
437		DB_ASSERT(env, hp->hash_page_dirty != 0);
438		--hp->hash_page_dirty;
439		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
440	}
441
442	/* Regardless, clear any sync wait-for count and remove our lock. */
443	bhp->ref_sync = 0;
444	F_CLR(bhp, BH_LOCKED);
445
446	/*
447	 * If a thread of control is waiting on this buffer, wake it up.
448	 */
449	if (F_ISSET(hp, IO_WAITER)) {
450		F_CLR(hp, IO_WAITER);
451		MUTEX_UNLOCK(env, hp->mtx_io);
452	}
453
454	return (ret);
455}
456
457/*
458 * __memp_pg --
459 *	Call the pgin/pgout routine.
460 *
461 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
462 */
463int
464__memp_pg(dbmfp, bhp, is_pgin)
465	DB_MPOOLFILE *dbmfp;
466	BH *bhp;
467	int is_pgin;
468{
469	DBT dbt, *dbtp;
470	DB_MPOOL *dbmp;
471	DB_MPREG *mpreg;
472	ENV *env;
473	MPOOLFILE *mfp;
474	int ftype, ret;
475
476	env = dbmfp->env;
477	dbmp = env->mp_handle;
478	mfp = dbmfp->mfp;
479
480	if ((ftype = mfp->ftype) == DB_FTYPE_SET)
481		mpreg = dbmp->pg_inout;
482	else {
483		MUTEX_LOCK(env, dbmp->mutex);
484		LIST_FOREACH(mpreg, &dbmp->dbregq, q)
485			if (ftype == mpreg->ftype)
486				break;
487		MUTEX_UNLOCK(env, dbmp->mutex);
488	}
489	if (mpreg == NULL)
490		return (0);
491
492	if (mfp->pgcookie_len == 0)
493		dbtp = NULL;
494	else {
495		DB_SET_DBT(dbt, R_ADDR(
496		    dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len);
497		dbtp = &dbt;
498	}
499
500	if (is_pgin) {
501		if (mpreg->pgin != NULL && (ret =
502		    mpreg->pgin(env->dbenv, bhp->pgno, bhp->buf, dbtp)) != 0)
503			goto err;
504	} else
505		if (mpreg->pgout != NULL && (ret =
506		    mpreg->pgout(env->dbenv, bhp->pgno, bhp->buf, dbtp)) != 0)
507			goto err;
508
509	return (0);
510
511err:	__db_errx(env, "%s: %s failed for page %lu",
512	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
513	return (ret);
514}
515
516/*
517 * __memp_bhfree --
518 *	Free a bucket header and its referenced data.
519 *
520 * PUBLIC: int __memp_bhfree
521 * PUBLIC:     __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, u_int32_t));
522 */
523int
524__memp_bhfree(dbmp, infop, hp, bhp, flags)
525	DB_MPOOL *dbmp;
526	REGINFO *infop;
527	DB_MPOOL_HASH *hp;
528	BH *bhp;
529	u_int32_t flags;
530{
531	ENV *env;
532#ifdef DIAGNOSTIC
533	DB_LSN vlsn;
534#endif
535	BH *prev_bhp;
536	MPOOL *c_mp;
537	MPOOLFILE *mfp;
538	int ret, t_ret;
539#ifdef DIAG_MVCC
540	size_t pagesize;
541#endif
542
543	ret = 0;
544
545	/*
546	 * Assumes the hash bucket is locked and the MPOOL is not.
547	 */
548	env = dbmp->env;
549	mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
550#ifdef DIAG_MVCC
551	pagesize = mfp->stat.st_pagesize;
552#endif
553
554	DB_ASSERT(env, bhp->ref == 0 && !F_ISSET(bhp, BH_FROZEN));
555	DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
556	    SH_CHAIN_SINGLETON(bhp, vc) ||
557	    (SH_CHAIN_HASNEXT(bhp, vc) &&
558	    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) ||
559	    (SH_CHAIN_HASPREV(bhp, vc) ?
560	    IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) :
561	    BH_OBSOLETE(bhp, hp->old_reader, vlsn)));
562
563	/*
564	 * Delete the buffer header from the hash bucket queue or the
565	 * version chain.
566	 */
567	prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh);
568	if (SH_CHAIN_NEXT(bhp, vc, __bh) == NULL) {
569		if (prev_bhp != NULL)
570			SH_TAILQ_INSERT_AFTER(&hp->hash_bucket,
571			    bhp, prev_bhp, hq, __bh);
572		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
573	}
574	SH_CHAIN_REMOVE(bhp, vc, __bh);
575
576	/*
577	 * Remove the reference to this buffer from the transaction that
578	 * created it, if any.  When the BH_FREE_UNLOCKED flag is set, we're
579	 * discarding the environment, so the transaction region is already
580	 * gone.
581	 */
582	if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) {
583		ret = __txn_remove_buffer(
584		    env, BH_OWNER(env, bhp), hp->mtx_hash);
585		bhp->td_off = INVALID_ROFF;
586	}
587
588	/*
589	 * We're going to use the memory for something else -- it had better be
590	 * accessible.
591	 */
592	MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC);
593
594	/*
595	 * If we're only removing this header from the chain for reuse, we're
596	 * done.
597	 */
598	if (LF_ISSET(BH_FREE_REUSE))
599		return (0);
600
601	/*
602	 * Discard the hash bucket's mutex, it's no longer needed, and
603	 * we don't want to be holding it when acquiring other locks.
604	 */
605	if (!LF_ISSET(BH_FREE_UNLOCKED))
606		MUTEX_UNLOCK(env, hp->mtx_hash);
607
608	/*
609	 * If we're not reusing the buffer immediately, free the buffer for
610	 * real.
611	 */
612	if (LF_ISSET(BH_FREE_FREEMEM)) {
613		MPOOL_REGION_LOCK(env, infop);
614
615		__memp_free(infop, mfp, bhp);
616		c_mp = infop->primary;
617		c_mp->stat.st_pages--;
618
619		MPOOL_REGION_UNLOCK(env, infop);
620	}
621
622	/*
623	 * Decrement the reference count of the underlying MPOOLFILE.
624	 * If this is its last reference, remove it.
625	 */
626	MUTEX_LOCK(env, mfp->mutex);
627	if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
628		if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
629			ret = t_ret;
630	} else
631		MUTEX_UNLOCK(env, mfp->mutex);
632
633	return (ret);
634}
635