1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/log.h"
13#include "dbinc/mp.h"
14#include "dbinc/db_page.h"
15#include "dbinc/hash.h"
16
17static int __memp_mpf_alloc __P((DB_MPOOL *,
18    DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
19static int __memp_mpf_find __P((ENV *,
20    DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
21
22/*
23 * __memp_fopen_pp --
24 *	DB_MPOOLFILE->open pre/post processing.
25 *
26 * PUBLIC: int __memp_fopen_pp
27 * PUBLIC:     __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
28 */
29int
30__memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
31	DB_MPOOLFILE *dbmfp;
32	const char *path;
33	u_int32_t flags;
34	int mode;
35	size_t pagesize;
36{
37	DB_THREAD_INFO *ip;
38	ENV *env;
39	int ret;
40
41	env = dbmfp->env;
42
43	/* Validate arguments. */
44	if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags,
45	    DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION |
46	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
47		return (ret);
48
49	/*
50	 * Require a non-zero, power-of-two pagesize, smaller than the
51	 * clear length.
52	 */
53	if (pagesize == 0 || !POWER_OF_TWO(pagesize)) {
54		__db_errx(env,
55		    "DB_MPOOLFILE->open: page sizes must be a power-of-2");
56		return (EINVAL);
57	}
58	if (dbmfp->clear_len > pagesize) {
59		__db_errx(env,
60		    "DB_MPOOLFILE->open: clear length larger than page size");
61		return (EINVAL);
62	}
63
64	/* Read-only checks, and local flag. */
65	if (LF_ISSET(DB_RDONLY) && path == NULL) {
66		__db_errx(env,
67		    "DB_MPOOLFILE->open: temporary files can't be readonly");
68		return (EINVAL);
69	}
70
71	if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) {
72		__db_errx(env,
73		   "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions");
74		return (EINVAL);
75	}
76
77	ENV_ENTER(env, ip);
78	REPLICATION_WRAP(env,
79	    (__memp_fopen(dbmfp, NULL,
80	    path, NULL, flags, mode, pagesize)), 0, ret);
81	ENV_LEAVE(env, ip);
82	return (ret);
83}
84
85/*
86 * __memp_fopen --
87 *	DB_MPOOLFILE->open.
88 *
89 * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *,
90 * PUBLIC:     const char *, const char **, u_int32_t, int, size_t));
91 */
92int
93__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
94	DB_MPOOLFILE *dbmfp;
95	MPOOLFILE *mfp;
96	const char *path;
97	const char **dirp;
98	u_int32_t flags;
99	int mode;
100	size_t pgsize;
101{
102	DB_ENV *dbenv;
103	DB_MPOOL *dbmp;
104	DB_MPOOLFILE *tmp_dbmfp;
105	DB_MPOOL_HASH *hp;
106	ENV *env;
107	MPOOL *mp;
108	MPOOLFILE *alloc_mfp;
109	size_t maxmap;
110	db_pgno_t last_pgno;
111	u_int32_t bucket, mbytes, bytes, oflags, pagesize;
112	int refinc, ret;
113	char *rpath;
114
115	/* If this handle is already open, return. */
116	if (F_ISSET(dbmfp, MP_OPEN_CALLED))
117		return (0);
118
119	env = dbmfp->env;
120	dbmp = env->mp_handle;
121	dbenv = env->dbenv;
122	mp = dbmp->reginfo[0].primary;
123	alloc_mfp = NULL;
124	mbytes = bytes = 0;
125	refinc = ret = 0;
126	rpath = NULL;
127
128	/*
129	 * We're keeping the page size as a size_t in the public API, but
130	 * it's a u_int32_t everywhere internally.
131	 */
132	pagesize = (u_int32_t)pgsize;
133
134	/*
135	 * We're called internally with a specified mfp, in which case the
136	 * path is NULL, but we'll get the path from the underlying region
137	 * information.  Otherwise, if the path is NULL, it's a temporary
138	 * file -- we know we can't join any existing files, and we'll delay
139	 * the open until we actually need to write the file. All temporary
140	 * files will go into the first hash bucket.
141	 */
142	DB_ASSERT(env, mfp == NULL || path == NULL);
143
144	bucket = 0;
145	hp = R_ADDR(dbmp->reginfo, mp->ftab);
146	if (mfp == NULL) {
147		if (path == NULL)
148			goto alloc;
149
150		/*
151		 * Hash to the proper file table entry and walk it.
152		 *
153		 * The fileID is a filesystem unique number (e.g., a
154		 * UNIX dev/inode pair) plus a timestamp.  If files are
155		 * removed and created in less than a second, the fileID
156		 * can be repeated.  The problem with repetition happens
157		 * when the file that previously had the fileID value still
158		 * has pages in the pool, since we don't want to use them
159		 * to satisfy requests for the new file. Because the
160		 * DB_TRUNCATE flag reuses the dev/inode pair, repeated
161		 * opens with that flag set guarantees matching fileIDs
162		 * when the machine can open a file and then re-open
163		 * with truncate within a second.  For this reason, we
164		 * pass that flag down, and, if we find a matching entry,
165		 * we ensure that it's never found again, and we create
166		 * a new entry for the current request.
167		 */
168
169		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
170			bucket = FNBUCKET(path, strlen(path));
171		else
172			bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
173		hp += bucket;
174
175		/*
176		 * If we are passed a FILEID find the MPOOLFILE and inc
177		 * its ref count.  That way it cannot go away while we
178		 * open it.
179		 */
180		if (F_ISSET(dbmfp, MP_FILEID_SET)) {
181			MUTEX_LOCK(env, hp->mtx_hash);
182			ret =
183			    __memp_mpf_find(env, dbmfp, hp, path, flags,&mfp);
184			MUTEX_UNLOCK(env, hp->mtx_hash);
185			if (ret != 0)
186				goto err;
187			if (mfp != NULL)
188				refinc = 1;
189		}
190	} else {
191		/*
192		 * Deadfile can only be set if mpf_cnt goes to zero (or if we
193		 * failed creating the file DB_AM_DISCARD).  Increment the ref
194		 * count so the file cannot become dead and be unlinked.
195		 */
196		MUTEX_LOCK(env, mfp->mutex);
197		if (!mfp->deadfile) {
198			++mfp->mpf_cnt;
199			refinc = 1;
200		}
201		MUTEX_UNLOCK(env, mfp->mutex);
202
203		/*
204		 * Test one last time to see if the file is dead -- it may have
205		 * been removed.  This happens when a checkpoint trying to open
206		 * the file to flush a buffer races with the Db::remove method.
207		 * The error will be ignored, so don't output an error message.
208		 */
209		if (mfp->deadfile)
210			return (EINVAL);
211	}
212
213	/*
214	 * If there's no backing file, we can join existing files in the cache,
215	 * but there's nothing to read from disk.
216	 */
217	if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
218		/* Convert MP open flags to DB OS-layer open flags. */
219		oflags = 0;
220		if (LF_ISSET(DB_CREATE))
221			oflags |= DB_OSO_CREATE;
222		if (LF_ISSET(DB_DIRECT))
223			oflags |= DB_OSO_DIRECT;
224		if (LF_ISSET(DB_RDONLY)) {
225			F_SET(dbmfp, MP_READONLY);
226			oflags |= DB_OSO_RDONLY;
227		}
228
229		/*
230		 * XXX
231		 * A grievous layering violation, the DB_DSYNC_DB flag
232		 * was left in the ENV structure and not driven through
233		 * the cache API.  This needs to be fixed when the general
234		 * API configuration is fixed.
235		 */
236		if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB))
237			oflags |= DB_OSO_DSYNC;
238
239		/*
240		 * Get the real name for this file and open it.
241		 *
242		 * Supply a page size so os_open can decide whether to
243		 * turn buffering off if the DB_DIRECT_DB flag is set.
244		 *
245		 * Acquire the region lock if we're using a path from
246		 * an underlying MPOOLFILE -- there's a race in accessing
247		 * the path name stored in the region, __memp_nameop may
248		 * be simultaneously renaming the file.
249		 */
250		if (mfp != NULL) {
251			MPOOL_SYSTEM_LOCK(env);
252			path = R_ADDR(dbmp->reginfo, mfp->path_off);
253		}
254		if ((ret = __db_appname(env,
255		     DB_APP_DATA, path, dirp, &rpath)) == 0)
256			ret = __os_open(env, rpath,
257			     (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
258		if (mfp != NULL)
259			MPOOL_SYSTEM_UNLOCK(env);
260		if (ret != 0)
261			goto err;
262
263		/*
264		 * Cache file handles are shared, and have mutexes to
265		 * protect the underlying file handle across seek and
266		 * read/write calls.
267		 */
268		dbmfp->fhp->ref = 1;
269		if ((ret = __mutex_alloc(env, MTX_MPOOL_FH,
270		     DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0)
271			goto err;
272
273		/*
274		 * Figure out the file's size.
275		 *
276		 * !!!
277		 * We can't use off_t's here, or in any code in the mainline
278		 * library for that matter.  (We have to use them in the
279		 * os stubs, of course, as there are system calls that
280		 * take them as arguments.)  The reason is some customers
281		 * build in environments where an off_t is 32-bits, but
282		 * still run where offsets are 64-bits, and they pay us
283		 * a lot of money.
284		 */
285		if ((ret = __os_ioinfo(
286		    env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
287			__db_err(env, ret, "%s", rpath);
288			goto err;
289		}
290
291		/*
292		 * Don't permit files that aren't a multiple of the pagesize,
293		 * and find the number of the last page in the file, all the
294		 * time being careful not to overflow 32 bits.
295		 *
296		 * During verify or recovery, we might have to cope with a
297		 * truncated file; if the file size is not a multiple of the
298		 * page size, round down to a page, we'll take care of the
299		 * partial page outside the mpool system.
300		 */
301		DB_ASSERT(env, pagesize != 0);
302		if (bytes % pagesize != 0) {
303			if (LF_ISSET(DB_ODDFILESIZE))
304				bytes -= (u_int32_t)(bytes % pagesize);
305			else {
306				__db_errx(env,
307		    "%s: file size not a multiple of the pagesize", rpath);
308				ret = EINVAL;
309				goto err;
310			}
311		}
312
313		/*
314		 * Get the file id if we weren't given one.  Generated file id's
315		 * don't use timestamps, otherwise there'd be no chance of any
316		 * other process joining the party.  Don't bother looking for
317		 * this id in the hash table, its new.
318		 */
319		if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) {
320			if  ((ret =
321			     __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
322				goto err;
323			F_SET(dbmfp, MP_FILEID_SET);
324			goto alloc;
325		}
326	}
327
328	if (mfp != NULL)
329		goto have_mfp;
330
331	/*
332	 * We can race with another process opening the same file when
333	 * we allocate the mpoolfile structure.  We will come back
334	 * here and check the hash table again to see if it has appeared.
335	 * For most files this is not a problem, since the name is locked
336	 * at a higher layer but QUEUE extent files are not locked.
337	 */
338check:	MUTEX_LOCK(env, hp->mtx_hash);
339	if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp) != 0))
340		goto err;
341
342	if (alloc_mfp != NULL && mfp == NULL) {
343		mfp = alloc_mfp;
344		alloc_mfp = NULL;
345		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
346	} else if (mfp != NULL) {
347		/*
348		 * Some things about a file cannot be changed: the clear length,
349		 * page size, or LSN location.  However, if this is an attempt
350		 * to open a named in-memory file, we may not yet have that
351		 * information. so accept uninitialized entries.
352		 *
353		 * The file type can change if the application's pre- and post-
354		 * processing needs change.  For example, an application that
355		 * created a hash subdatabase in a database that was previously
356		 * all btree.
357		 *
358		 * !!!
359		 * We do not check to see if the pgcookie information changed,
360		 * or update it if it is.
361		 */
362		if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET &&
363		    mfp->clear_len != DB_CLEARLEN_NOTSET &&
364		    dbmfp->clear_len != mfp->clear_len) ||
365		    (pagesize != 0 && pagesize != mfp->stat.st_pagesize) ||
366		    (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
367		    mfp->lsn_off != DB_LSN_OFF_NOTSET &&
368		    dbmfp->lsn_offset != mfp->lsn_off)) {
369			__db_errx(env,
370		    "%s: clear length, page size or LSN location changed",
371			    path);
372			MUTEX_UNLOCK(env, hp->mtx_hash);
373			ret = EINVAL;
374			goto err;
375		}
376	}
377
378	MUTEX_UNLOCK(env, hp->mtx_hash);
379	if (alloc_mfp != NULL) {
380		MUTEX_LOCK(env, alloc_mfp->mutex);
381		if ((ret = __memp_mf_discard(dbmp, alloc_mfp)) != 0)
382			goto err;
383	}
384
385	if (mfp == NULL) {
386		/*
387		 * If we didn't find the file and this is an in-memory file,
388		 * then the create flag should be set.
389		 */
390		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
391		    !LF_ISSET(DB_CREATE)) {
392			ret = ENOENT;
393			goto err;
394		}
395
396alloc:		/*
397		 * Get the file ID if we weren't given one.  Generated file
398		 * ID's don't use timestamps, otherwise there'd be no
399		 * chance of any other process joining the party.
400		 */
401		if (path != NULL &&
402		     !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
403		     !F_ISSET(dbmfp, MP_FILEID_SET) && (ret =
404			    __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
405				goto err;
406
407		if ((ret = __memp_mpf_alloc(dbmp,
408		     dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
409			goto err;
410
411		/*
412		 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
413		 * page get, we have to increment the last page in the file.
414		 * Figure it out and save it away.
415		 *
416		 * Note correction: page numbers are zero-based, not 1-based.
417		 */
418		DB_ASSERT(env, pagesize != 0);
419		last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
420		last_pgno += (db_pgno_t)(bytes / pagesize);
421		if (last_pgno != 0)
422			--last_pgno;
423
424		alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno =
425		    alloc_mfp->last_pgno = last_pgno;
426
427		alloc_mfp->bucket = bucket;
428
429		/* Go back and see if someone else has opened the file. */
430		if (path != NULL)
431			goto check;
432
433		mfp = alloc_mfp;
434		/* This is a temp, noone else can see it, put it at the end. */
435		MUTEX_LOCK(env, hp->mtx_hash);
436		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q);
437		MUTEX_UNLOCK(env, hp->mtx_hash);
438	}
439have_mfp:
440	/*
441	 * We need to verify that all handles open a file either durable or not
442	 * durable.  This needs to be cross process and cross sub-databases, so
443	 * mpool is the place to do it.
444	 */
445	if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) {
446		if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) {
447			if (LF_ISSET(DB_TXN_NOT_DURABLE))
448				F_SET(mfp, MP_NOT_DURABLE);
449			F_CLR(mfp, MP_DURABLE_UNKNOWN);
450		} else if (!LF_ISSET(DB_TXN_NOT_DURABLE) !=
451		    !F_ISSET(mfp, MP_NOT_DURABLE)) {
452			__db_errx(env,
453	     "Cannot open DURABLE and NOT DURABLE handles in the same file");
454			ret = EINVAL;
455			goto err;
456		}
457	}
458
459	if (LF_ISSET(DB_MULTIVERSION)) {
460		++mfp->multiversion;
461		F_SET(dbmfp, MP_MULTIVERSION);
462	}
463
464	/*
465	 * All paths to here have initialized the mfp variable to reference
466	 * the selected (or allocated) MPOOLFILE.
467	 */
468	dbmfp->mfp = mfp;
469
470	/*
471	 * Check to see if we can mmap the file.  If a file:
472	 *	+ isn't temporary
473	 *	+ is read-only
474	 *	+ doesn't require any pgin/pgout support
475	 *	+ the DB_NOMMAP flag wasn't set (in either the file open or
476	 *	  the environment in which it was opened)
477	 *	+ and is less than mp_mmapsize bytes in size
478	 *
479	 * we can mmap it instead of reading/writing buffers.  Don't do error
480	 * checking based on the mmap call failure.  We want to do normal I/O
481	 * on the file if the reason we failed was because the file was on an
482	 * NFS mounted partition, and we can fail in buffer I/O just as easily
483	 * as here.
484	 *
485	 * We'd like to test to see if the file is too big to mmap.  Since we
486	 * don't know what size or type off_t's or size_t's are, or the largest
487	 * unsigned integral type is, or what random insanity the local C
488	 * compiler will perpetrate, doing the comparison in a portable way is
489	 * flatly impossible.  Hope that mmap fails if the file is too large.
490	 */
491#define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 MB. */
492	if (F_ISSET(mfp, MP_CAN_MMAP)) {
493		maxmap = dbenv->mp_mmapsize == 0 ?
494		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
495		if (path == NULL ||
496		    FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
497			F_CLR(mfp, MP_CAN_MMAP);
498		else if (!F_ISSET(dbmfp, MP_READONLY))
499			F_CLR(mfp, MP_CAN_MMAP);
500		else if (dbmfp->ftype != 0)
501			F_CLR(mfp, MP_CAN_MMAP);
502		else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
503			F_CLR(mfp, MP_CAN_MMAP);
504		else {
505			MPOOL_SYSTEM_LOCK(env);
506			maxmap = mp->mp_mmapsize == 0 ?
507			    DB_MAXMMAPSIZE : mp->mp_mmapsize;
508			MPOOL_SYSTEM_UNLOCK(env);
509			if (mbytes > maxmap / MEGABYTE ||
510			    (mbytes == maxmap / MEGABYTE &&
511			    bytes >= maxmap % MEGABYTE))
512				F_CLR(mfp, MP_CAN_MMAP);
513		}
514
515		dbmfp->addr = NULL;
516		if (F_ISSET(mfp, MP_CAN_MMAP)) {
517			dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
518			if (__os_mapfile(env, rpath,
519			    dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
520				dbmfp->addr = NULL;
521				F_CLR(mfp, MP_CAN_MMAP);
522			}
523		}
524	}
525
526	F_SET(dbmfp, MP_OPEN_CALLED);
527
528	/*
529	 * Share the underlying file descriptor if that's possible.
530	 *
531	 * Add the file to the process' list of DB_MPOOLFILEs.
532	 */
533	MUTEX_LOCK(env, dbmp->mutex);
534
535	if (dbmfp->fhp != NULL)
536		TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q)
537			if (dbmfp->mfp == tmp_dbmfp->mfp &&
538			    (F_ISSET(dbmfp, MP_READONLY) ||
539			    !F_ISSET(tmp_dbmfp, MP_READONLY))) {
540				(void)__mutex_free(env, &dbmfp->fhp->mtx_fh);
541				(void)__os_closehandle(env, dbmfp->fhp);
542				++tmp_dbmfp->fhp->ref;
543				dbmfp->fhp = tmp_dbmfp->fhp;
544				break;
545			}
546
547	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
548
549	MUTEX_UNLOCK(env, dbmp->mutex);
550
551	if (0) {
552err:		if (refinc) {
553			/*
554			 * If mpf_cnt goes to zero here and unlink_on_close is
555			 * set, then we missed the last close, but there was an
556			 * error trying to open the file, so we probably cannot
557			 * unlink it anyway.
558			 */
559			MUTEX_LOCK(env, mfp->mutex);
560			--mfp->mpf_cnt;
561			MUTEX_UNLOCK(env, mfp->mutex);
562		}
563
564	}
565	if (rpath != NULL)
566		__os_free(env, rpath);
567	return (ret);
568}
569
570/*
571 * __memp_mpf_find --
572 *	Search a hash bucket for a MPOOLFILE.
573 */
574static int
575__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp)
576	ENV *env;
577	DB_MPOOLFILE *dbmfp;
578	DB_MPOOL_HASH *hp;
579	const char *path;
580	u_int32_t flags;
581	MPOOLFILE **mfpp;
582{
583	DB_MPOOL *dbmp;
584	MPOOLFILE *mfp;
585
586	dbmp = env->mp_handle;
587
588	SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
589		/* Skip dead files and temporary files. */
590		if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
591			continue;
592
593		/*
594		 * Any remaining DB_MPOOL_NOFILE databases are in-memory
595		 * named databases and need only match other in-memory
596		 * databases with the same name.
597		 */
598		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
599			if (!mfp->no_backing_file)
600				continue;
601
602			if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
603				continue;
604
605			/*
606			 * We matched an in-memory file; grab the fileid if
607			 * it is set in the region, but not in the dbmfp.
608			 */
609			if (!F_ISSET(dbmfp, MP_FILEID_SET))
610				(void)__memp_set_fileid(dbmfp,
611				    R_ADDR(dbmp->reginfo, mfp->fileid_off));
612		} else
613			if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
614			    mfp->fileid_off), DB_FILE_ID_LEN) != 0)
615				continue;
616
617		/*
618		 * If the file is being truncated, remove it from the system
619		 * and create a new entry.
620		 *
621		 * !!!
622		 * We should be able to set mfp to NULL and break out of the
623		 * loop, but I like the idea of checking all the entries.
624		 */
625		if (LF_ISSET(DB_TRUNCATE)) {
626			MUTEX_LOCK(env, mfp->mutex);
627			mfp->deadfile = 1;
628			MUTEX_UNLOCK(env, mfp->mutex);
629			continue;
630		}
631
632		/*
633		 * Check to see if this file has died while we waited.
634		 *
635		 * We normally don't lock the deadfile field when we read it as
636		 * we only care if the field is zero or non-zero.  We do lock
637		 * on read when searching for a matching MPOOLFILE so that two
638		 * threads of control don't race between setting the deadfile
639		 * bit and incrementing the reference count, that is, a thread
640		 * of control decrementing the reference count and then setting
641		 * deadfile because the reference count is 0 blocks us finding
642		 * the file without knowing it's about to be marked dead.
643		 */
644		MUTEX_LOCK(env, mfp->mutex);
645		if (mfp->deadfile) {
646			MUTEX_UNLOCK(env, mfp->mutex);
647			continue;
648		}
649		++mfp->mpf_cnt;
650		MUTEX_UNLOCK(env, mfp->mutex);
651
652		/* Initialize any fields that are not yet set. */
653		if (dbmfp->ftype != 0)
654			mfp->ftype = dbmfp->ftype;
655		if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
656			mfp->clear_len = dbmfp->clear_len;
657		if (dbmfp->lsn_offset != -1)
658			mfp->lsn_off = dbmfp->lsn_offset;
659
660		break;
661	}
662
663	*mfpp = mfp;
664	return (0);
665}
666
667static int
668__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
669	DB_MPOOL *dbmp;
670	DB_MPOOLFILE *dbmfp;
671	const char *path;
672	u_int32_t pagesize;
673	u_int32_t flags;
674	MPOOLFILE **retmfp;
675{
676	ENV *env;
677	MPOOLFILE *mfp;
678	int ret;
679	void *p;
680
681	env = dbmp->env;
682	ret = 0;
683	/* Allocate and initialize a new MPOOLFILE. */
684	if ((ret = __memp_alloc(dbmp,
685	     dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
686		goto err;
687	memset(mfp, 0, sizeof(MPOOLFILE));
688	mfp->mpf_cnt = 1;
689	mfp->ftype = dbmfp->ftype;
690	mfp->stat.st_pagesize = pagesize;
691	mfp->lsn_off = dbmfp->lsn_offset;
692	mfp->clear_len = dbmfp->clear_len;
693	mfp->priority = dbmfp->priority;
694	if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
695		mfp->maxpgno = (db_pgno_t)
696		    (dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize));
697		mfp->maxpgno += (db_pgno_t)
698		    ((dbmfp->bytes + mfp->stat.st_pagesize - 1) /
699		    mfp->stat.st_pagesize);
700	}
701	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
702		mfp->no_backing_file = 1;
703	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
704		mfp->unlink_on_close = 1;
705
706	if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY))
707		F_SET(mfp, MP_DURABLE_UNKNOWN);
708	if (LF_ISSET(DB_DIRECT))
709		F_SET(mfp, MP_DIRECT);
710	if (LF_ISSET(DB_EXTENT))
711		F_SET(mfp, MP_EXTENT);
712	if (LF_ISSET(DB_TXN_NOT_DURABLE))
713		F_SET(mfp, MP_NOT_DURABLE);
714	F_SET(mfp, MP_CAN_MMAP);
715
716	/*
717	 * An in-memory database with no name is a temp file.  Named
718	 * in-memory databases get an artificially  bumped reference
719	 * count so they don't disappear on close; they need a remove
720	 * to make them disappear.
721	 */
722	if (path == NULL)
723		F_SET(mfp, MP_TEMP);
724	else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
725		mfp->mpf_cnt++;
726
727	/* Copy the file identification string into shared memory. */
728	if (F_ISSET(dbmfp, MP_FILEID_SET)) {
729		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
730		    NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
731			goto err;
732		memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
733	}
734
735	/* Copy the file path into shared memory. */
736	if (path != NULL) {
737		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
738		    NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
739			goto err;
740		memcpy(p, path, strlen(path) + 1);
741	}
742
743	/* Copy the page cookie into shared memory. */
744	if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
745		mfp->pgcookie_len = 0;
746		mfp->pgcookie_off = 0;
747	} else {
748		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
749		    NULL, dbmfp->pgcookie->size,
750		    &mfp->pgcookie_off, &p)) != 0)
751			goto err;
752		memcpy(p,
753		     dbmfp->pgcookie->data, dbmfp->pgcookie->size);
754		mfp->pgcookie_len = dbmfp->pgcookie->size;
755	}
756
757	if ((ret = __mutex_alloc(env,
758	    MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0)
759		goto err;
760	*retmfp = mfp;
761
762err:	return (ret);
763}
764
765/*
766 * memp_fclose_pp --
767 *	DB_MPOOLFILE->close pre/post processing.
768 *
769 * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
770 */
771int
772__memp_fclose_pp(dbmfp, flags)
773	DB_MPOOLFILE *dbmfp;
774	u_int32_t flags;
775{
776	DB_THREAD_INFO *ip;
777	ENV *env;
778	int ret;
779
780	env = dbmfp->env;
781
782	/*
783	 * Validate arguments, but as a handle destructor, we can't fail.
784	 */
785	if (flags != 0)
786		(void)__db_ferr(env, "DB_MPOOLFILE->close", 0);
787
788	ENV_ENTER(env, ip);
789	REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret);
790	ENV_LEAVE(env, ip);
791	return (ret);
792}
793
794/*
795 * __memp_fclose --
796 *	DB_MPOOLFILE->close.
797 *
798 * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
799 */
800int
801__memp_fclose(dbmfp, flags)
802	DB_MPOOLFILE *dbmfp;
803	u_int32_t flags;
804{
805	DB_MPOOL *dbmp;
806	ENV *env;
807	MPOOLFILE *mfp;
808	char *rpath;
809	u_int32_t ref;
810	int deleted, ret, t_ret;
811
812	env = dbmfp->env;
813	dbmp = env->mp_handle;
814	ret = 0;
815
816	/*
817	 * Remove the DB_MPOOLFILE from the process' list.
818	 *
819	 * It's possible the underlying mpool cache may never have been created.
820	 * In that case, all we have is a structure, discard it.
821	 *
822	 * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
823	 * file list, check the MP_OPEN_CALLED flag to be sure.
824	 */
825	if (dbmp == NULL)
826		goto done;
827
828	MUTEX_LOCK(env, dbmp->mutex);
829
830	DB_ASSERT(env, dbmfp->ref >= 1);
831	if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED))
832		TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
833
834	/*
835	 * Decrement the file descriptor's ref count -- if we're the last ref,
836	 * we'll discard the file descriptor.
837	 */
838	if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0)
839		dbmfp->fhp = NULL;
840	MUTEX_UNLOCK(env, dbmp->mutex);
841	if (ref != 0)
842		return (0);
843
844	/* Complain if pinned blocks never returned. */
845	if (dbmfp->pinref != 0) {
846		__db_errx(env, "%s: close: %lu blocks left pinned",
847		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
848		ret = __env_panic(env, DB_RUNRECOVERY);
849	}
850
851	/* Discard any mmap information. */
852	if (dbmfp->addr != NULL &&
853	    (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0)
854		__db_err(env, ret, "%s", __memp_fn(dbmfp));
855
856	/*
857	 * Close the file and discard the descriptor structure; temporary
858	 * files may not yet have been created.
859	 */
860	if (dbmfp->fhp != NULL) {
861		if ((t_ret =
862		    __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0)
863			ret = t_ret;
864		if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) {
865			__db_err(env, t_ret, "%s", __memp_fn(dbmfp));
866			if (ret == 0)
867				ret = t_ret;
868		}
869		dbmfp->fhp = NULL;
870	}
871
872	/*
873	 * Discard our reference on the underlying MPOOLFILE, and close it
874	 * if it's no longer useful to anyone.  It possible the open of the
875	 * file never happened or wasn't successful, in which case, mpf will
876	 * be NULL and MP_OPEN_CALLED will not be set.
877	 */
878	mfp = dbmfp->mfp;
879	DB_ASSERT(env,
880	    (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) ||
881	    (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL));
882	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))
883		goto done;
884
885	/*
886	 * If it's a temp file, all outstanding references belong to unflushed
887	 * buffers.  (A temp file can only be referenced by one DB_MPOOLFILE).
888	 * We don't care about preserving any of those buffers, so mark the
889	 * MPOOLFILE as dead so that even the dirty ones just get discarded
890	 * when we try to flush them.
891	 */
892	deleted = 0;
893	if (!LF_ISSET(DB_MPOOL_NOLOCK))
894		MUTEX_LOCK(env, mfp->mutex);
895	if (F_ISSET(dbmfp, MP_MULTIVERSION))
896		--mfp->multiversion;
897	if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
898		if (LF_ISSET(DB_MPOOL_DISCARD) ||
899		    F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
900			mfp->deadfile = 1;
901		}
902		if (mfp->unlink_on_close) {
903			if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA,
904			    R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
905			    &rpath)) != 0 && ret == 0)
906				ret = t_ret;
907			if (t_ret == 0) {
908				if ((t_ret = __os_unlink(
909				    dbmp->env, rpath, 0)) != 0 && ret == 0)
910					ret = t_ret;
911				__os_free(env, rpath);
912			}
913		}
914		if (mfp->mpf_cnt == 0) {
915			F_CLR(mfp, MP_NOT_DURABLE);
916			F_SET(mfp, MP_DURABLE_UNKNOWN);
917		}
918		if (mfp->block_cnt == 0) {
919			/*
920			 * We should never discard this mp file if our caller
921			 * is holding the lock on it.  See comment in
922			 * __memp_sync_file.
923			 */
924			DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK));
925			if ((t_ret =
926			    __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
927				ret = t_ret;
928			deleted = 1;
929		}
930	}
931	if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
932		MUTEX_UNLOCK(env, mfp->mutex);
933
934done:	/* Discard the DB_MPOOLFILE structure. */
935	if (dbmfp->pgcookie != NULL) {
936		__os_free(env, dbmfp->pgcookie->data);
937		__os_free(env, dbmfp->pgcookie);
938	}
939	__os_free(env, dbmfp);
940
941	return (ret);
942}
943
944/*
945 * __memp_mf_discard --
946 *	Discard an MPOOLFILE.
947 *
948 * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
949 */
950int
951__memp_mf_discard(dbmp, mfp)
952	DB_MPOOL *dbmp;
953	MPOOLFILE *mfp;
954{
955	DB_MPOOL_HASH *hp;
956	ENV *env;
957#ifdef HAVE_STATISTICS
958	DB_MPOOL_STAT *sp;
959#endif
960	MPOOL *mp;
961	int need_sync, ret, t_ret;
962
963	env = dbmp->env;
964	mp = dbmp->reginfo[0].primary;
965	hp = R_ADDR(dbmp->reginfo, mp->ftab);
966	hp += mfp->bucket;
967	ret = 0;
968
969	/*
970	 * Expects caller to be holding the MPOOLFILE mutex.
971	 *
972	 * When discarding a file, we have to flush writes from it to disk.
973	 * The scenario is that dirty buffers from this file need to be
974	 * flushed to satisfy a future checkpoint, but when the checkpoint
975	 * calls mpool sync, the sync code won't know anything about them.
976	 * Ignore files not written, discarded, or only temporary.
977	 */
978	need_sync =
979	   mfp->file_written && !mfp->deadfile && !F_ISSET(mfp, MP_TEMP);
980
981	/*
982	 * We have to release the MPOOLFILE mutex before acquiring the region
983	 * mutex so we don't deadlock.  Make sure nobody ever looks at this
984	 * structure again.
985	 */
986	mfp->deadfile = 1;
987
988	/* Discard the mutex we're holding and return it too the pool. */
989	MUTEX_UNLOCK(env, mfp->mutex);
990	if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
991		ret = t_ret;
992
993	/* Lock the bucket and delete from the list of MPOOLFILEs. */
994	MUTEX_LOCK(env, hp->mtx_hash);
995	SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
996	MUTEX_UNLOCK(env, hp->mtx_hash);
997
998	/* Lock the region and collect stats and free the space. */
999	MPOOL_SYSTEM_LOCK(env);
1000	if (need_sync &&
1001	    (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
1002		ret = t_ret;
1003
1004#ifdef HAVE_STATISTICS
1005	/* Copy the statistics into the region. */
1006	sp = &mp->stat;
1007	sp->st_cache_hit += mfp->stat.st_cache_hit;
1008	sp->st_cache_miss += mfp->stat.st_cache_miss;
1009	sp->st_map += mfp->stat.st_map;
1010	sp->st_page_create += mfp->stat.st_page_create;
1011	sp->st_page_in += mfp->stat.st_page_in;
1012	sp->st_page_out += mfp->stat.st_page_out;
1013#endif
1014
1015	/* Free the space. */
1016	if (mfp->path_off != 0)
1017		__memp_free(&dbmp->reginfo[0],
1018		    R_ADDR(dbmp->reginfo, mfp->path_off));
1019	if (mfp->fileid_off != 0)
1020		__memp_free(&dbmp->reginfo[0],
1021		    R_ADDR(dbmp->reginfo, mfp->fileid_off));
1022	if (mfp->pgcookie_off != 0)
1023		__memp_free(&dbmp->reginfo[0],
1024		    R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
1025	__memp_free(&dbmp->reginfo[0], mfp);
1026
1027	MPOOL_SYSTEM_UNLOCK(env);
1028
1029	return (ret);
1030}
1031
1032/*
1033 * __memp_inmemlist --
1034 *	Return a list of the named in-memory databases.
1035 *
1036 * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *));
1037 */
1038int
1039__memp_inmemlist(env, namesp, cntp)
1040	ENV *env;
1041	char ***namesp;
1042	int *cntp;
1043{
1044	DB_MPOOL *dbmp;
1045	DB_MPOOL_HASH *hp;
1046	MPOOL *mp;
1047	MPOOLFILE *mfp;
1048	int arraysz, cnt, i, ret;
1049	char **names;
1050
1051	names = NULL;
1052	dbmp = env->mp_handle;
1053	mp = dbmp->reginfo[0].primary;
1054	hp = R_ADDR(dbmp->reginfo, mp->ftab);
1055
1056	arraysz = cnt = 0;
1057	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
1058		MUTEX_LOCK(env, hp->mtx_hash);
1059		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
1060			/* Skip dead files and temporary files. */
1061			if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
1062				continue;
1063
1064			/* Skip entries that allow files. */
1065			if (!mfp->no_backing_file)
1066				continue;
1067
1068			/* We found one. */
1069			if (cnt >= arraysz) {
1070				arraysz += 100;
1071				if ((ret = __os_realloc(env,
1072				    (u_int)arraysz * sizeof(names[0]),
1073				    &names)) != 0)
1074					goto nomem;
1075			}
1076			if ((ret = __os_strdup(env,
1077			    R_ADDR(dbmp->reginfo, mfp->path_off),
1078			    &names[cnt])) != 0)
1079				goto nomem;
1080
1081			cnt++;
1082		}
1083		MUTEX_UNLOCK(env, hp->mtx_hash);
1084	}
1085	*namesp = names;
1086	*cntp = cnt;
1087	return (0);
1088
1089nomem:	MUTEX_UNLOCK(env, hp->mtx_hash);
1090	if (names != NULL) {
1091		while (--cnt >= 0)
1092			__os_free(env, names[cnt]);
1093		__os_free(env, names);
1094	}
1095
1096	/* Make sure we don't return any garbage. */
1097	*cntp = 0;
1098	*namesp = NULL;
1099	return (ret);
1100}
1101