1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"		/* Required for diagnostic code. */
13#include "dbinc/mp.h"
14#include "dbinc/log.h"
15#include "dbinc/txn.h"
16
17static int __memp_pgwrite
18	       __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
19
20/*
21 * __memp_bhwrite --
22 *	Write the page associated with a given buffer header.
23 *
24 * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
25 * PUBLIC:      DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
26 */
27int
28__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
29	DB_MPOOL *dbmp;
30	DB_MPOOL_HASH *hp;
31	MPOOLFILE *mfp;
32	BH *bhp;
33	int open_extents;
34{
35	DB_MPOOLFILE *dbmfp;
36	DB_MPREG *mpreg;
37	ENV *env;
38	int ret;
39
40	env = dbmp->env;
41
42	/*
43	 * If the file has been removed or is a closed temporary file, we're
44	 * done -- the page-write function knows how to handle the fact that
45	 * we don't have (or need!) any real file descriptor information.
46	 */
47	if (mfp->deadfile)
48		return (__memp_pgwrite(env, NULL, hp, bhp));
49
50	/*
51	 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
52	 * the file.  We also check that the descriptor is open for writing.
53	 */
54	MUTEX_LOCK(env, dbmp->mutex);
55	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
56		if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) {
57			++dbmfp->ref;
58			break;
59		}
60	MUTEX_UNLOCK(env, dbmp->mutex);
61
62	if (dbmfp != NULL) {
63		/*
64		 * Temporary files may not have been created.  We only handle
65		 * temporary files in this path, because only the process that
66		 * created a temporary file will ever flush buffers to it.
67		 */
68		if (dbmfp->fhp == NULL) {
69			/* We may not be allowed to create backing files. */
70			if (mfp->no_backing_file) {
71				--dbmfp->ref;
72				return (EPERM);
73			}
74
75			MUTEX_LOCK(env, dbmp->mutex);
76			if (dbmfp->fhp == NULL) {
77				ret = __db_tmp_open(env,
78				    F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ?
79				    DB_OSO_DIRECT : 0, &dbmfp->fhp);
80			} else
81				ret = 0;
82			MUTEX_UNLOCK(env, dbmp->mutex);
83			if (ret != 0) {
84				__db_errx(env,
85				    "unable to create temporary backing file");
86				--dbmfp->ref;
87				return (ret);
88			}
89		}
90
91		goto pgwrite;
92	}
93
94	/*
95	 * There's no file handle for this file in our process.
96	 *
97	 * !!!
98	 * It's the caller's choice if we're going to open extent files.
99	 */
100	if (!open_extents && F_ISSET(mfp, MP_EXTENT))
101		return (EPERM);
102
103	/*
104	 * !!!
105	 * Don't try to attach to temporary files.  There are two problems in
106	 * trying to do that.  First, if we have different privileges than the
107	 * process that "owns" the temporary file, we might create the backing
108	 * disk file such that the owning process couldn't read/write its own
109	 * buffers, e.g., memp_trickle running as root creating a file owned
110	 * as root, mode 600.  Second, if the temporary file has already been
111	 * created, we don't have any way of finding out what its real name is,
112	 * and, even if we did, it was already unlinked (so that it won't be
113	 * left if the process dies horribly).  This decision causes a problem,
114	 * however: if the temporary file consumes the entire buffer cache,
115	 * and the owner doesn't flush the buffers to disk, we could end up
116	 * with resource starvation, and the memp_trickle thread couldn't do
117	 * anything about it.  That's a pretty unlikely scenario, though.
118	 *
119	 * Note we should never get here when the temporary file in question
120	 * has already been closed in another process, in which case it should
121	 * be marked dead.
122	 */
123	if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
124		return (EPERM);
125
126	/*
127	 * It's not a page from a file we've opened.  If the file requires
128	 * application-specific input/output processing, see if this process
129	 * has ever registered information as to how to write this type of
130	 * file.  If not, there's nothing we can do.
131	 */
132	if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) {
133		MUTEX_LOCK(env, dbmp->mutex);
134		LIST_FOREACH(mpreg, &dbmp->dbregq, q)
135			if (mpreg->ftype == mfp->ftype)
136				break;
137		MUTEX_UNLOCK(env, dbmp->mutex);
138		if (mpreg == NULL)
139			return (EPERM);
140	}
141
142	/*
143	 * Try and open the file, specifying the known underlying shared area.
144	 *
145	 * !!!
146	 * There's no negative cache, so we may repeatedly try and open files
147	 * that we have previously tried (and failed) to open.
148	 */
149	if ((ret = __memp_fcreate(env, &dbmfp)) != 0)
150		return (ret);
151	if ((ret = __memp_fopen(dbmfp, mfp,
152	    NULL, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) {
153		(void)__memp_fclose(dbmfp, 0);
154
155		/*
156		 * Ignore any error if the file is marked dead, assume the file
157		 * was removed from under us.
158		 */
159		if (!mfp->deadfile)
160			return (ret);
161
162		dbmfp = NULL;
163	}
164
165pgwrite:
166	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
167	    PROT_READ | PROT_WRITE | PROT_EXEC);
168	ret = __memp_pgwrite(env, dbmfp, hp, bhp);
169	if (dbmfp == NULL)
170		return (ret);
171
172	/*
173	 * Discard our reference, and, if we're the last reference, make sure
174	 * the file eventually gets closed.
175	 */
176	MUTEX_LOCK(env, dbmp->mutex);
177	if (dbmfp->ref == 1)
178		F_SET(dbmfp, MP_FLUSH);
179	else
180		--dbmfp->ref;
181	MUTEX_UNLOCK(env, dbmp->mutex);
182
183	return (ret);
184}
185
186/*
187 * __memp_pgread --
188 *	Read a page from a file.
189 *
190 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
191 */
192int
193__memp_pgread(dbmfp, bhp, can_create)
194	DB_MPOOLFILE *dbmfp;
195	BH *bhp;
196	int can_create;
197{
198	ENV *env;
199	MPOOLFILE *mfp;
200	size_t len, nr;
201	u_int32_t pagesize;
202	int ret;
203
204	env = dbmfp->env;
205	mfp = dbmfp->mfp;
206	pagesize = mfp->stat.st_pagesize;
207
208	/* We should never be called with a dirty or unlocked buffer. */
209	DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_FROZEN));
210	DB_ASSERT(env, can_create || !F_ISSET(bhp, BH_DIRTY));
211	DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE));
212
213	/* Mark the buffer as in transistion. */
214	F_SET(bhp, BH_TRASH);
215
216	/*
217	 * Temporary files may not yet have been created.  We don't create
218	 * them now, we create them when the pages have to be flushed.
219	 */
220	nr = 0;
221	if (dbmfp->fhp != NULL)
222		if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp,
223		    bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0)
224			goto err;
225
226	/*
227	 * The page may not exist; if it doesn't, nr may well be 0, but we
228	 * expect the underlying OS calls not to return an error code in
229	 * this case.
230	 */
231	if (nr < pagesize) {
232		/*
233		 * Don't output error messages for short reads.  In particular,
234		 * DB recovery processing may request pages never written to
235		 * disk or for which only some part have been written to disk,
236		 * in which case we won't find the page.  The caller must know
237		 * how to handle the error.
238		 */
239		if (!can_create) {
240			ret = DB_PAGE_NOTFOUND;
241			goto err;
242		}
243
244		/* Clear any bytes that need to be cleared. */
245		len = mfp->clear_len == DB_CLEARLEN_NOTSET ?
246		    pagesize : mfp->clear_len;
247		memset(bhp->buf, 0, len);
248
249#if defined(DIAGNOSTIC) || defined(UMRW)
250		/*
251		 * If we're running in diagnostic mode, corrupt any bytes on
252		 * the page that are unknown quantities for the caller.
253		 */
254		if (len < pagesize)
255			memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
256#endif
257#ifdef HAVE_STATISTICS
258		++mfp->stat.st_page_create;
259	} else
260		++mfp->stat.st_page_in;
261#else
262	}
263#endif
264
265	/* Call any pgin function. */
266	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
267
268	/*
269	 * If no errors occurred, the data is now valid, clear the BH_TRASH
270	 * flag.
271	 */
272	if (ret == 0)
273		F_CLR(bhp, BH_TRASH);
274err:	return (ret);
275}
276
277/*
278 * __memp_pgwrite --
279 *	Write a page to a file.
280 */
281static int
282__memp_pgwrite(env, dbmfp, hp, bhp)
283	ENV *env;
284	DB_MPOOLFILE *dbmfp;
285	DB_MPOOL_HASH *hp;
286	BH *bhp;
287{
288	DB_LSN lsn;
289	MPOOLFILE *mfp;
290	size_t nw;
291	int ret;
292	void * buf;
293
294	/*
295	 * Since writing does not require exclusive access, another thread
296	 * could have already written this buffer.
297	 */
298	if (!F_ISSET(bhp, BH_DIRTY))
299		return (0);
300
301	mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
302	ret = 0;
303	buf = NULL;
304
305	/* We should never be called with a frozen or trashed buffer. */
306	DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
307
308	/*
309	 * It's possible that the underlying file doesn't exist, either
310	 * because of an outright removal or because it was a temporary
311	 * file that's been closed.
312	 *
313	 * !!!
314	 * Once we pass this point, we know that dbmfp and mfp aren't NULL,
315	 * and that we have a valid file reference.
316	 */
317	if (mfp == NULL || mfp->deadfile)
318		goto file_dead;
319
320	/*
321	 * If the page is in a file for which we have LSN information, we have
322	 * to ensure the appropriate log records are on disk.
323	 */
324	if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
325	    !IS_CLIENT_PGRECOVER(env)) {
326		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
327		if (!IS_NOT_LOGGED_LSN(lsn) &&
328		    (ret = __log_flush(env, &lsn)) != 0)
329			goto err;
330	}
331
332#ifdef DIAGNOSTIC
333	/*
334	 * Verify write-ahead logging semantics.
335	 *
336	 * !!!
337	 * Two special cases.  There is a single field on the meta-data page,
338	 * the last-page-number-in-the-file field, for which we do not log
339	 * changes.  If the page was originally created in a database that
340	 * didn't have logging turned on, we can see a page marked dirty but
341	 * for which no corresponding log record has been written.  However,
342	 * the only way that a page can be created for which there isn't a
343	 * previous log record and valid LSN is when the page was created
344	 * without logging turned on, and so we check for that special-case
345	 * LSN value.
346	 *
347	 * Second, when a client is reading database pages from a master
348	 * during an internal backup, we may get pages modified after
349	 * the current end-of-log.
350	 */
351	if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
352	    !IS_CLIENT_PGRECOVER(env)) {
353		/*
354		 * There is a potential race here.  If we are in the midst of
355		 * switching log files, it's possible we could test against the
356		 * old file and the new offset in the log region's LSN.  If we
357		 * fail the first test, acquire the log mutex and check again.
358		 */
359		DB_LOG *dblp;
360		LOG *lp;
361
362		dblp = env->lg_handle;
363		lp = dblp->reginfo.primary;
364		if (!lp->db_log_inmemory &&
365		    LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
366			MUTEX_LOCK(env, lp->mtx_flush);
367			DB_ASSERT(env,
368			    LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0);
369			MUTEX_UNLOCK(env, lp->mtx_flush);
370		}
371	}
372#endif
373
374	/*
375	 * Call any pgout function.  If we have the page exclusive then
376	 * we are going to reuse it otherwise make a copy of the page so
377	 * that others can continue looking at the page while we write it.
378	 */
379	buf = bhp->buf;
380	if (mfp->ftype != 0) {
381		if (F_ISSET(bhp, BH_EXCLUSIVE))
382			F_SET(bhp, BH_TRASH);
383		else {
384			if ((ret =
385			    __os_malloc(env, mfp->stat.st_pagesize, &buf)) != 0)
386				goto err;
387			memcpy(buf, bhp->buf, mfp->stat.st_pagesize);
388		}
389		if ((ret = __memp_pg(dbmfp, bhp->pgno, buf, 0)) != 0)
390			goto err;
391	}
392
393	/* Write the page. */
394	if ((ret = __os_io(
395	    env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno, mfp->stat.st_pagesize,
396	    0, mfp->stat.st_pagesize, buf, &nw)) != 0) {
397		__db_errx(env, "%s: write failed for page %lu",
398		    __memp_fn(dbmfp), (u_long)bhp->pgno);
399		goto err;
400	}
401	STAT(++mfp->stat.st_page_out);
402	if (bhp->pgno > mfp->last_flushed_pgno) {
403		MUTEX_LOCK(env, mfp->mutex);
404		if (bhp->pgno > mfp->last_flushed_pgno)
405			mfp->last_flushed_pgno = bhp->pgno;
406		MUTEX_UNLOCK(env, mfp->mutex);
407	}
408
409err:
410file_dead:
411	if (buf != NULL && buf != bhp->buf)
412		__os_free(env, buf);
413	/*
414	 * !!!
415	 * Once we pass this point, dbmfp and mfp may be NULL, we may not have
416	 * a valid file reference.
417	 */
418
419	/*
420	 * Update the hash bucket statistics, reset the flags.  If we were
421	 * successful, the page is no longer dirty.  Someone else may have
422	 * also written the page so we need to latch the hash bucket here
423	 * to get the accounting correct.  Since we have the buffer
424	 * shared it cannot be marked dirty again till we release it.
425	 * This is the only place we update the flags field only holding
426	 * a shared latch.
427	 */
428	if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) {
429		MUTEX_LOCK(env, hp->mtx_hash);
430		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
431		if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) {
432			F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
433			DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
434			atomic_dec(env, &hp->hash_page_dirty);
435		}
436
437		/* put the page back if necessary. */
438		if ((ret != 0 || BH_REFCOUNT(bhp) > 1) &&
439		    F_ISSET(bhp, BH_TRASH)) {
440			ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
441			F_CLR(bhp, BH_TRASH);
442		}
443		MUTEX_UNLOCK(env, hp->mtx_hash);
444	}
445
446	return (ret);
447}
448
449/*
450 * __memp_pg --
451 *	Call the pgin/pgout routine.
452 *
453 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
454 */
455int
456__memp_pg(dbmfp, pgno, buf, is_pgin)
457	DB_MPOOLFILE *dbmfp;
458	db_pgno_t pgno;
459	void *buf;
460	int is_pgin;
461{
462	DBT dbt, *dbtp;
463	DB_MPOOL *dbmp;
464	DB_MPREG *mpreg;
465	ENV *env;
466	MPOOLFILE *mfp;
467	int ftype, ret;
468
469	env = dbmfp->env;
470	dbmp = env->mp_handle;
471	mfp = dbmfp->mfp;
472
473	if ((ftype = mfp->ftype) == DB_FTYPE_SET)
474		mpreg = dbmp->pg_inout;
475	else {
476		MUTEX_LOCK(env, dbmp->mutex);
477		LIST_FOREACH(mpreg, &dbmp->dbregq, q)
478			if (ftype == mpreg->ftype)
479				break;
480		MUTEX_UNLOCK(env, dbmp->mutex);
481	}
482	if (mpreg == NULL)
483		return (0);
484
485	if (mfp->pgcookie_len == 0)
486		dbtp = NULL;
487	else {
488		DB_SET_DBT(dbt, R_ADDR(
489		    dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len);
490		dbtp = &dbt;
491	}
492
493	if (is_pgin) {
494		if (mpreg->pgin != NULL && (ret =
495		    mpreg->pgin(env->dbenv, pgno, buf, dbtp)) != 0)
496			goto err;
497	} else
498		if (mpreg->pgout != NULL && (ret =
499		    mpreg->pgout(env->dbenv, pgno, buf, dbtp)) != 0)
500			goto err;
501
502	return (0);
503
504err:	__db_errx(env, "%s: %s failed for page %lu",
505	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)pgno);
506	return (ret);
507}
508
509/*
510 * __memp_bhfree --
511 *	Free a bucket header and its referenced data.
512 *
513 * PUBLIC: int __memp_bhfree __P((DB_MPOOL *,
514 * PUBLIC:	REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
515 */
516int
517__memp_bhfree(dbmp, infop, mfp, hp, bhp, flags)
518	DB_MPOOL *dbmp;
519	REGINFO *infop;
520	MPOOLFILE *mfp;
521	DB_MPOOL_HASH *hp;
522	BH *bhp;
523	u_int32_t flags;
524{
525	ENV *env;
526#ifdef DIAGNOSTIC
527	DB_LSN vlsn;
528#endif
529	BH *prev_bhp;
530	MPOOL *c_mp;
531	int ret, t_ret;
532#ifdef DIAG_MVCC
533	size_t pagesize;
534#endif
535
536	ret = 0;
537
538	/*
539	 * Assumes the hash bucket is locked and the MPOOL is not.
540	 */
541	env = dbmp->env;
542#ifdef DIAG_MVCC
543	if (mfp != NULL)
544		pagesize = mfp->stat.st_pagesize;
545#endif
546
547	DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
548	    (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash)));
549	DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 &&
550	    !F_ISSET(bhp, BH_DIRTY | BH_FROZEN));
551	DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) ||
552	    SH_CHAIN_SINGLETON(bhp, vc) || (SH_CHAIN_HASNEXT(bhp, vc) &&
553	    (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
554	    bhp->td_off == INVALID_ROFF ||
555	    IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
556	    BH_OBSOLETE(bhp, hp->old_reader, vlsn))));
557
558	/*
559	 * Delete the buffer header from the hash bucket queue or the
560	 * version chain.
561	 */
562	if (hp == NULL)
563		goto no_hp;
564	prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh);
565	if (!SH_CHAIN_HASNEXT(bhp, vc)) {
566		if (prev_bhp != NULL)
567			SH_TAILQ_INSERT_AFTER(&hp->hash_bucket,
568			    bhp, prev_bhp, hq, __bh);
569		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
570	}
571	SH_CHAIN_REMOVE(bhp, vc, __bh);
572
573	/*
574	 * Remove the reference to this buffer from the transaction that
575	 * created it, if any.  When the BH_FREE_UNLOCKED flag is set, we're
576	 * discarding the environment, so the transaction region is already
577	 * gone.
578	 */
579	if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) {
580		ret = __txn_remove_buffer(
581		    env, BH_OWNER(env, bhp), hp->mtx_hash);
582		bhp->td_off = INVALID_ROFF;
583	}
584
585	/*
586	 * We're going to use the memory for something else -- it had better be
587	 * accessible.
588	 */
589no_hp:	MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC);
590
591	/*
592	 * Discard the hash bucket's mutex, it's no longer needed, and
593	 * we don't want to be holding it when acquiring other locks.
594	 */
595	if (!LF_ISSET(BH_FREE_UNLOCKED))
596		MUTEX_UNLOCK(env, hp->mtx_hash);
597
598	/*
599	 * If we're only removing this header from the chain for reuse, we're
600	 * done.
601	 */
602	if (LF_ISSET(BH_FREE_REUSE))
603		return (ret);
604
605	/*
606	 * If we're not reusing the buffer immediately, free the buffer for
607	 * real.
608	 */
609	if (!LF_ISSET(BH_FREE_UNLOCKED))
610		MUTEX_UNLOCK(env, bhp->mtx_buf);
611	if (LF_ISSET(BH_FREE_FREEMEM)) {
612		if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
613			return (ret);
614		MPOOL_REGION_LOCK(env, infop);
615
616		MVCC_BHUNALIGN(bhp);
617		__memp_free(infop, bhp);
618		c_mp = infop->primary;
619		c_mp->stat.st_pages--;
620
621		MPOOL_REGION_UNLOCK(env, infop);
622	}
623
624	if (mfp == NULL)
625		return (ret);
626
627	/*
628	 * Decrement the reference count of the underlying MPOOLFILE.
629	 * If this is its last reference, remove it.
630	 */
631	MUTEX_LOCK(env, mfp->mutex);
632	if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
633		if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
634			ret = t_ret;
635	} else
636		MUTEX_UNLOCK(env, mfp->mutex);
637
638	return (ret);
639}
640