1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: mp_fopen.c,v 12.50 2008/01/31 18:40:45 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/log.h"
13#include "dbinc/mp.h"
14#include "dbinc/db_page.h"
15#include "dbinc/hash.h"
16
/*
 * Prototypes for the file-local helpers defined later in this file:
 * __memp_mpf_alloc builds a new MPOOLFILE and __memp_mpf_find searches
 * a file-table hash bucket for an existing one.
 */
static int __memp_mpf_alloc __P((DB_MPOOL *,
    DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
static int __memp_mpf_find __P((ENV *,
    DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
21
22/*
23 * __memp_fopen_pp --
24 *	DB_MPOOLFILE->open pre/post processing.
25 *
26 * PUBLIC: int __memp_fopen_pp
27 * PUBLIC:     __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t));
28 */
29int
30__memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
31	DB_MPOOLFILE *dbmfp;
32	const char *path;
33	u_int32_t flags;
34	int mode;
35	size_t pagesize;
36{
37	DB_THREAD_INFO *ip;
38	ENV *env;
39	int ret;
40
41	env = dbmfp->env;
42
43	/* Validate arguments. */
44	if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags,
45	    DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION |
46	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
47		return (ret);
48
49	/*
50	 * Require a non-zero, power-of-two pagesize, smaller than the
51	 * clear length.
52	 */
53	if (pagesize == 0 || !POWER_OF_TWO(pagesize)) {
54		__db_errx(env,
55		    "DB_MPOOLFILE->open: page sizes must be a power-of-2");
56		return (EINVAL);
57	}
58	if (dbmfp->clear_len > pagesize) {
59		__db_errx(env,
60		    "DB_MPOOLFILE->open: clear length larger than page size");
61		return (EINVAL);
62	}
63
64	/* Read-only checks, and local flag. */
65	if (LF_ISSET(DB_RDONLY) && path == NULL) {
66		__db_errx(env,
67		    "DB_MPOOLFILE->open: temporary files can't be readonly");
68		return (EINVAL);
69	}
70
71	if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) {
72		__db_errx(env,
73		   "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions");
74		return (EINVAL);
75	}
76
77	ENV_ENTER(env, ip);
78	REPLICATION_WRAP(env,
79	    (__memp_fopen(dbmfp, NULL, path, flags, mode, pagesize)), 0, ret);
80	ENV_LEAVE(env, ip);
81	return (ret);
82}
83
84/*
85 * __memp_fopen --
86 *	DB_MPOOLFILE->open.
87 *
88 * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *,
89 * PUBLIC:     MPOOLFILE *, const char *, u_int32_t, int, size_t));
90 */
91int
92__memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
93	DB_MPOOLFILE *dbmfp;
94	MPOOLFILE *mfp;
95	const char *path;
96	u_int32_t flags;
97	int mode;
98	size_t pgsize;
99{
100	DB_ENV *dbenv;
101	DB_MPOOL *dbmp;
102	DB_MPOOLFILE *tmp_dbmfp;
103	DB_MPOOL_HASH *hp;
104	ENV *env;
105	MPOOL *mp;
106	MPOOLFILE *alloc_mfp;
107	size_t maxmap;
108	db_pgno_t last_pgno;
109	u_int32_t bucket, mbytes, bytes, oflags, pagesize;
110	int refinc, ret;
111	char *rpath;
112
113	/* If this handle is already open, return. */
114	if (F_ISSET(dbmfp, MP_OPEN_CALLED))
115		return (0);
116
117	env = dbmfp->env;
118	dbmp = env->mp_handle;
119	dbenv = env->dbenv;
120	mp = dbmp->reginfo[0].primary;
121	alloc_mfp = NULL;
122	mbytes = bytes = 0;
123	refinc = ret = 0;
124	rpath = NULL;
125
126	/*
127	 * We're keeping the page size as a size_t in the public API, but
128	 * it's a u_int32_t everywhere internally.
129	 */
130	pagesize = (u_int32_t)pgsize;
131
132	/*
133	 * We're called internally with a specified mfp, in which case the
134	 * path is NULL, but we'll get the path from the underlying region
135	 * information.  Otherwise, if the path is NULL, it's a temporary
136	 * file -- we know we can't join any existing files, and we'll delay
137	 * the open until we actually need to write the file. All temporary
138	 * files will go into the first hash bucket.
139	 */
140	DB_ASSERT(env, mfp == NULL || path == NULL);
141
142	bucket = 0;
143	hp = R_ADDR(dbmp->reginfo, mp->ftab);
144	if (mfp == NULL) {
145		if (path == NULL)
146			goto alloc;
147
148		/*
149		 * Hash to the proper file table entry and walk it.
150		 *
151		 * The fileID is a filesystem unique number (e.g., a
152		 * UNIX dev/inode pair) plus a timestamp.  If files are
153		 * removed and created in less than a second, the fileID
154		 * can be repeated.  The problem with repetition happens
155		 * when the file that previously had the fileID value still
156		 * has pages in the pool, since we don't want to use them
157		 * to satisfy requests for the new file. Because the
158		 * DB_TRUNCATE flag reuses the dev/inode pair, repeated
159		 * opens with that flag set guarantees matching fileIDs
160		 * when the machine can open a file and then re-open
161		 * with truncate within a second.  For this reason, we
162		 * pass that flag down, and, if we find a matching entry,
163		 * we ensure that it's never found again, and we create
164		 * a new entry for the current request.
165		 */
166
167		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
168			bucket = FNBUCKET(path, strlen(path));
169		else
170			bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
171		hp += bucket;
172
173		/*
174		 * If we are passed a FILEID find the MPOOLFILE and inc
175		 * its ref count.  That way it cannot go away while we
176		 * open it.
177		 */
178		if (F_ISSET(dbmfp, MP_FILEID_SET)) {
179			MUTEX_LOCK(env, hp->mtx_hash);
180			ret =
181			    __memp_mpf_find(env, dbmfp, hp, path, flags,&mfp);
182			MUTEX_UNLOCK(env, hp->mtx_hash);
183			if (ret != 0)
184				goto err;
185			if (mfp != NULL)
186				refinc = 1;
187		}
188	} else {
189		/*
190		 * Deadfile can only be set if mpf_cnt goes to zero (or if we
191		 * failed creating the file DB_AM_DISCARD).  Increment the ref
192		 * count so the file cannot become dead and be unlinked.
193		 */
194		MUTEX_LOCK(env, mfp->mutex);
195		if (!mfp->deadfile) {
196			++mfp->mpf_cnt;
197			refinc = 1;
198		}
199		MUTEX_UNLOCK(env, mfp->mutex);
200
201		/*
202		 * Test one last time to see if the file is dead -- it may have
203		 * been removed.  This happens when a checkpoint trying to open
204		 * the file to flush a buffer races with the Db::remove method.
205		 * The error will be ignored, so don't output an error message.
206		 */
207		if (mfp->deadfile)
208			return (EINVAL);
209	}
210
211	/*
212	 * If there's no backing file, we can join existing files in the cache,
213	 * but there's nothing to read from disk.
214	 */
215	if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
216		/* Convert MP open flags to DB OS-layer open flags. */
217		oflags = 0;
218		if (LF_ISSET(DB_CREATE))
219			oflags |= DB_OSO_CREATE;
220		if (LF_ISSET(DB_DIRECT))
221			oflags |= DB_OSO_DIRECT;
222		if (LF_ISSET(DB_RDONLY)) {
223			F_SET(dbmfp, MP_READONLY);
224			oflags |= DB_OSO_RDONLY;
225		}
226
227		/*
228		 * XXX
229		 * A grievous layering violation, the DB_DSYNC_DB flag
230		 * was left in the ENV structure and not driven through
231		 * the cache API.  This needs to be fixed when the general
232		 * API configuration is fixed.
233		 */
234		if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB))
235			oflags |= DB_OSO_DSYNC;
236
237		/*
238		 * Get the real name for this file and open it.
239		 *
240		 * Supply a page size so os_open can decide whether to
241		 * turn buffering off if the DB_DIRECT_DB flag is set.
242		 *
243		 * Acquire the region lock if we're using a path from
244		 * an underlying MPOOLFILE -- there's a race in accessing
245		 * the path name stored in the region, __memp_nameop may
246		 * be simultaneously renaming the file.
247		 */
248		if (mfp != NULL) {
249			MPOOL_SYSTEM_LOCK(env);
250			path = R_ADDR(dbmp->reginfo, mfp->path_off);
251		}
252		if ((ret = __db_appname(env,
253		     DB_APP_DATA, path, 0, NULL, &rpath)) == 0)
254			ret = __os_open(env, rpath,
255			     (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
256		if (mfp != NULL)
257			MPOOL_SYSTEM_UNLOCK(env);
258		if (ret != 0)
259			goto err;
260
261		/*
262		 * Cache file handles are shared, and have mutexes to
263		 * protect the underlying file handle across seek and
264		 * read/write calls.
265		 */
266		dbmfp->fhp->ref = 1;
267		if ((ret = __mutex_alloc(env, MTX_MPOOL_FH,
268		     DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0)
269			goto err;
270
271		/*
272		 * Figure out the file's size.
273		 *
274		 * !!!
275		 * We can't use off_t's here, or in any code in the mainline
276		 * library for that matter.  (We have to use them in the
277		 * os stubs, of course, as there are system calls that
278		 * take them as arguments.)  The reason is some customers
279		 * build in environments where an off_t is 32-bits, but
280		 * still run where offsets are 64-bits, and they pay us
281		 * a lot of money.
282		 */
283		if ((ret = __os_ioinfo(
284		    env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
285			__db_err(env, ret, "%s", rpath);
286			goto err;
287		}
288
289		/*
290		 * Don't permit files that aren't a multiple of the pagesize,
291		 * and find the number of the last page in the file, all the
292		 * time being careful not to overflow 32 bits.
293		 *
294		 * During verify or recovery, we might have to cope with a
295		 * truncated file; if the file size is not a multiple of the
296		 * page size, round down to a page, we'll take care of the
297		 * partial page outside the mpool system.
298		 */
299		DB_ASSERT(env, pagesize != 0);
300		if (bytes % pagesize != 0) {
301			if (LF_ISSET(DB_ODDFILESIZE))
302				bytes -= (u_int32_t)(bytes % pagesize);
303			else {
304				__db_errx(env,
305		    "%s: file size not a multiple of the pagesize", rpath);
306				ret = EINVAL;
307				goto err;
308			}
309		}
310
311		/*
312		 * Get the file id if we weren't given one.  Generated file id's
313		 * don't use timestamps, otherwise there'd be no chance of any
314		 * other process joining the party.  Don't bother looking for
315		 * this id in the hash table, its new.
316		 */
317		if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) {
318			if  ((ret =
319			     __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
320				goto err;
321			F_SET(dbmfp, MP_FILEID_SET);
322			goto alloc;
323		}
324	}
325
326	if (mfp != NULL)
327		goto have_mfp;
328
329	/*
330	 * We can race with another process opening the same file when
331	 * we allocate the mpoolfile structure.  We will come back
332	 * here and check the hash table again to see if it has appeared.
333	 * For most files this is not a problem, since the name is locked
334	 * at a higher layer but QUEUE extent files are not locked.
335	 */
336check:	MUTEX_LOCK(env, hp->mtx_hash);
337	if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp) != 0))
338		goto err;
339
340	if (alloc_mfp != NULL && mfp == NULL) {
341		mfp = alloc_mfp;
342		alloc_mfp = NULL;
343		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
344	} else if (mfp != NULL) {
345		/*
346		 * Some things about a file cannot be changed: the clear length,
347		 * page size, or LSN location.  However, if this is an attempt
348		 * to open a named in-memory file, we may not yet have that
349		 * information. so accept uninitialized entries.
350		 *
351		 * The file type can change if the application's pre- and post-
352		 * processing needs change.  For example, an application that
353		 * created a hash subdatabase in a database that was previously
354		 * all btree.
355		 *
356		 * !!!
357		 * We do not check to see if the pgcookie information changed,
358		 * or update it if it is.
359		 */
360		if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET &&
361		    mfp->clear_len != DB_CLEARLEN_NOTSET &&
362		    dbmfp->clear_len != mfp->clear_len) ||
363		    (pagesize != 0 && pagesize != mfp->stat.st_pagesize) ||
364		    (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
365		    mfp->lsn_off != DB_LSN_OFF_NOTSET &&
366		    dbmfp->lsn_offset != mfp->lsn_off)) {
367			__db_errx(env,
368		    "%s: clear length, page size or LSN location changed",
369			    path);
370			MUTEX_UNLOCK(env, hp->mtx_hash);
371			ret = EINVAL;
372			goto err;
373		}
374	}
375
376	MUTEX_UNLOCK(env, hp->mtx_hash);
377	if (alloc_mfp != NULL) {
378		MUTEX_LOCK(env, alloc_mfp->mutex);
379		if ((ret = __memp_mf_discard(dbmp, alloc_mfp)) != 0)
380			goto err;
381	}
382
383	if (mfp == NULL) {
384		/*
385		 * If we didn't find the file and this is an in-memory file,
386		 * then the create flag should be set.
387		 */
388		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
389		    !LF_ISSET(DB_CREATE)) {
390			ret = ENOENT;
391			goto err;
392		}
393
394alloc:		/*
395		 * Get the file ID if we weren't given one.  Generated file
396		 * ID's don't use timestamps, otherwise there'd be no
397		 * chance of any other process joining the party.
398		 */
399		if (path != NULL &&
400		     !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) &&
401		     !F_ISSET(dbmfp, MP_FILEID_SET) && (ret =
402			    __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0)
403				goto err;
404
405		if ((ret = __memp_mpf_alloc(dbmp,
406		     dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
407			goto err;
408
409		/*
410		 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
411		 * page get, we have to increment the last page in the file.
412		 * Figure it out and save it away.
413		 *
414		 * Note correction: page numbers are zero-based, not 1-based.
415		 */
416		DB_ASSERT(env, pagesize != 0);
417		last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
418		last_pgno += (db_pgno_t)(bytes / pagesize);
419		if (last_pgno != 0)
420			--last_pgno;
421
422		alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno =
423		    alloc_mfp->last_pgno = last_pgno;
424
425		alloc_mfp->bucket = bucket;
426
427		/* Go back and see if someone else has opened the file. */
428		if (path != NULL)
429			goto check;
430
431		mfp = alloc_mfp;
432		/* This is a temp, noone else can see it, put it at the end. */
433		MUTEX_LOCK(env, hp->mtx_hash);
434		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q);
435		MUTEX_UNLOCK(env, hp->mtx_hash);
436	}
437have_mfp:
438	/*
439	 * We need to verify that all handles open a file either durable or not
440	 * durable.  This needs to be cross process and cross sub-databases, so
441	 * mpool is the place to do it.
442	 */
443	if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) {
444		if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) {
445			if (LF_ISSET(MP_NOT_DURABLE))
446				F_SET(mfp, MP_NOT_DURABLE);
447			F_CLR(mfp, MP_DURABLE_UNKNOWN);
448		} else if (!LF_ISSET(DB_TXN_NOT_DURABLE) !=
449		    !F_ISSET(mfp, MP_NOT_DURABLE)) {
450			__db_errx(env,
451	     "Cannot open DURABLE and NOT DURABLE handles in the same file");
452			ret = EINVAL;
453			goto err;
454		}
455	}
456
457	if (LF_ISSET(DB_MULTIVERSION)) {
458		++mfp->multiversion;
459		F_SET(dbmfp, MP_MULTIVERSION);
460	}
461
462	/*
463	 * All paths to here have initialized the mfp variable to reference
464	 * the selected (or allocated) MPOOLFILE.
465	 */
466	dbmfp->mfp = mfp;
467
468	/*
469	 * Check to see if we can mmap the file.  If a file:
470	 *	+ isn't temporary
471	 *	+ is read-only
472	 *	+ doesn't require any pgin/pgout support
473	 *	+ the DB_NOMMAP flag wasn't set (in either the file open or
474	 *	  the environment in which it was opened)
475	 *	+ and is less than mp_mmapsize bytes in size
476	 *
477	 * we can mmap it instead of reading/writing buffers.  Don't do error
478	 * checking based on the mmap call failure.  We want to do normal I/O
479	 * on the file if the reason we failed was because the file was on an
480	 * NFS mounted partition, and we can fail in buffer I/O just as easily
481	 * as here.
482	 *
483	 * We'd like to test to see if the file is too big to mmap.  Since we
484	 * don't know what size or type off_t's or size_t's are, or the largest
485	 * unsigned integral type is, or what random insanity the local C
486	 * compiler will perpetrate, doing the comparison in a portable way is
487	 * flatly impossible.  Hope that mmap fails if the file is too large.
488	 */
489#define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 MB. */
490	if (F_ISSET(mfp, MP_CAN_MMAP)) {
491		maxmap = dbenv->mp_mmapsize == 0 ?
492		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
493		if (path == NULL ||
494		    FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
495			F_CLR(mfp, MP_CAN_MMAP);
496		else if (!F_ISSET(dbmfp, MP_READONLY))
497			F_CLR(mfp, MP_CAN_MMAP);
498		else if (dbmfp->ftype != 0)
499			F_CLR(mfp, MP_CAN_MMAP);
500		else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
501			F_CLR(mfp, MP_CAN_MMAP);
502		else {
503			MPOOL_SYSTEM_LOCK(env);
504			maxmap = mp->mp_mmapsize == 0 ?
505			    DB_MAXMMAPSIZE : mp->mp_mmapsize;
506			MPOOL_SYSTEM_UNLOCK(env);
507			if (mbytes > maxmap / MEGABYTE ||
508			    (mbytes == maxmap / MEGABYTE &&
509			    bytes >= maxmap % MEGABYTE))
510				F_CLR(mfp, MP_CAN_MMAP);
511		}
512
513		dbmfp->addr = NULL;
514		if (F_ISSET(mfp, MP_CAN_MMAP)) {
515			dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
516			if (__os_mapfile(env, rpath,
517			    dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
518				dbmfp->addr = NULL;
519				F_CLR(mfp, MP_CAN_MMAP);
520			}
521		}
522	}
523
524	F_SET(dbmfp, MP_OPEN_CALLED);
525
526	/*
527	 * Share the underlying file descriptor if that's possible.
528	 *
529	 * Add the file to the process' list of DB_MPOOLFILEs.
530	 */
531	MUTEX_LOCK(env, dbmp->mutex);
532
533	if (dbmfp->fhp != NULL)
534		TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q)
535			if (dbmfp->mfp == tmp_dbmfp->mfp &&
536			    (F_ISSET(dbmfp, MP_READONLY) ||
537			    !F_ISSET(tmp_dbmfp, MP_READONLY))) {
538				(void)__mutex_free(env, &dbmfp->fhp->mtx_fh);
539				(void)__os_closehandle(env, dbmfp->fhp);
540				++tmp_dbmfp->fhp->ref;
541				dbmfp->fhp = tmp_dbmfp->fhp;
542				break;
543			}
544
545	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
546
547	MUTEX_UNLOCK(env, dbmp->mutex);
548
549	if (0) {
550err:		if (refinc) {
551			/*
552			 * If mpf_cnt goes to zero here and unlink_on_close is
553			 * set, then we missed the last close, but there was an
554			 * error trying to open the file, so we probably cannot
555			 * unlink it anyway.
556			 */
557			MUTEX_LOCK(env, mfp->mutex);
558			--mfp->mpf_cnt;
559			MUTEX_UNLOCK(env, mfp->mutex);
560		}
561
562	}
563	if (rpath != NULL)
564		__os_free(env, rpath);
565	return (ret);
566}
567
568/*
569 * __memp_mpf_find --
570 *	Search a hash bucket for a MPOOLFILE.
571 */
572static int
573__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp)
574	ENV *env;
575	DB_MPOOLFILE *dbmfp;
576	DB_MPOOL_HASH *hp;
577	const char *path;
578	u_int32_t flags;
579	MPOOLFILE **mfpp;
580{
581	DB_MPOOL *dbmp;
582	MPOOLFILE *mfp;
583
584	dbmp = env->mp_handle;
585
586	SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
587		/* Skip dead files and temporary files. */
588		if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
589			continue;
590
591		/*
592		 * Any remaining DB_MPOOL_NOFILE databases are in-memory
593		 * named databases and need only match other in-memory
594		 * databases with the same name.
595		 */
596		if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
597			if (!mfp->no_backing_file)
598				continue;
599
600			if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
601				continue;
602
603			/*
604			 * We matched an in-memory file; grab the fileid if
605			 * it is set in the region, but not in the dbmfp.
606			 */
607			if (!F_ISSET(dbmfp, MP_FILEID_SET))
608				(void)__memp_set_fileid(dbmfp,
609				    R_ADDR(dbmp->reginfo, mfp->fileid_off));
610		} else
611			if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
612			    mfp->fileid_off), DB_FILE_ID_LEN) != 0)
613				continue;
614
615		/*
616		 * If the file is being truncated, remove it from the system
617		 * and create a new entry.
618		 *
619		 * !!!
620		 * We should be able to set mfp to NULL and break out of the
621		 * loop, but I like the idea of checking all the entries.
622		 */
623		if (LF_ISSET(DB_TRUNCATE)) {
624			MUTEX_LOCK(env, mfp->mutex);
625			mfp->deadfile = 1;
626			MUTEX_UNLOCK(env, mfp->mutex);
627			continue;
628		}
629
630		/*
631		 * Check to see if this file has died while we waited.
632		 *
633		 * We normally don't lock the deadfile field when we read it as
634		 * we only care if the field is zero or non-zero.  We do lock
635		 * on read when searching for a matching MPOOLFILE so that two
636		 * threads of control don't race between setting the deadfile
637		 * bit and incrementing the reference count, that is, a thread
638		 * of control decrementing the reference count and then setting
639		 * deadfile because the reference count is 0 blocks us finding
640		 * the file without knowing it's about to be marked dead.
641		 */
642		MUTEX_LOCK(env, mfp->mutex);
643		if (mfp->deadfile) {
644			MUTEX_UNLOCK(env, mfp->mutex);
645			continue;
646		}
647		++mfp->mpf_cnt;
648		MUTEX_UNLOCK(env, mfp->mutex);
649
650		/* Initialize any fields that are not yet set. */
651		if (dbmfp->ftype != 0)
652			mfp->ftype = dbmfp->ftype;
653		if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
654			mfp->clear_len = dbmfp->clear_len;
655		if (dbmfp->lsn_offset != -1)
656			mfp->lsn_off = dbmfp->lsn_offset;
657
658		break;
659	}
660
661	*mfpp = mfp;
662	return (0);
663}
664
665static int
666__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
667	DB_MPOOL *dbmp;
668	DB_MPOOLFILE *dbmfp;
669	const char *path;
670	u_int32_t pagesize;
671	u_int32_t flags;
672	MPOOLFILE **retmfp;
673{
674	ENV *env;
675	MPOOLFILE *mfp;
676	int ret;
677	void *p;
678
679	env = dbmp->env;
680	ret = 0;
681	/* Allocate and initialize a new MPOOLFILE. */
682	if ((ret = __memp_alloc(dbmp,
683	     dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
684		goto err;
685	memset(mfp, 0, sizeof(MPOOLFILE));
686	mfp->mpf_cnt = 1;
687	mfp->ftype = dbmfp->ftype;
688	mfp->stat.st_pagesize = pagesize;
689	mfp->lsn_off = dbmfp->lsn_offset;
690	mfp->clear_len = dbmfp->clear_len;
691	mfp->priority = dbmfp->priority;
692	if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
693		mfp->maxpgno = (db_pgno_t)
694		    (dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize));
695		mfp->maxpgno += (db_pgno_t)
696		    ((dbmfp->bytes + mfp->stat.st_pagesize - 1) /
697		    mfp->stat.st_pagesize);
698	}
699	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
700		mfp->no_backing_file = 1;
701	if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
702		mfp->unlink_on_close = 1;
703
704	if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY))
705		F_SET(mfp, MP_DURABLE_UNKNOWN);
706	if (LF_ISSET(DB_DIRECT))
707		F_SET(mfp, MP_DIRECT);
708	if (LF_ISSET(DB_EXTENT))
709		F_SET(mfp, MP_EXTENT);
710	if (LF_ISSET(DB_TXN_NOT_DURABLE))
711		F_SET(mfp, MP_NOT_DURABLE);
712	F_SET(mfp, MP_CAN_MMAP);
713
714	/*
715	 * An in-memory database with no name is a temp file.  Named
716	 * in-memory databases get an artificially  bumped reference
717	 * count so they don't disappear on close; they need a remove
718	 * to make them disappear.
719	 */
720	if (path == NULL)
721		F_SET(mfp, MP_TEMP);
722	else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
723		mfp->mpf_cnt++;
724
725	/* Copy the file identification string into shared memory. */
726	if (F_ISSET(dbmfp, MP_FILEID_SET)) {
727		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
728		    NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
729			goto err;
730		memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
731	}
732
733	/* Copy the file path into shared memory. */
734	if (path != NULL) {
735		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
736		    NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
737			goto err;
738		memcpy(p, path, strlen(path) + 1);
739	}
740
741	/* Copy the page cookie into shared memory. */
742	if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
743		mfp->pgcookie_len = 0;
744		mfp->pgcookie_off = 0;
745	} else {
746		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
747		    NULL, dbmfp->pgcookie->size,
748		    &mfp->pgcookie_off, &p)) != 0)
749			goto err;
750		memcpy(p,
751		     dbmfp->pgcookie->data, dbmfp->pgcookie->size);
752		mfp->pgcookie_len = dbmfp->pgcookie->size;
753	}
754
755	if ((ret = __mutex_alloc(env,
756	    MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0)
757		goto err;
758	*retmfp = mfp;
759
760err:	return (ret);
761}
762
763/*
764 * memp_fclose_pp --
765 *	DB_MPOOLFILE->close pre/post processing.
766 *
767 * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
768 */
769int
770__memp_fclose_pp(dbmfp, flags)
771	DB_MPOOLFILE *dbmfp;
772	u_int32_t flags;
773{
774	DB_THREAD_INFO *ip;
775	ENV *env;
776	int ret;
777
778	env = dbmfp->env;
779
780	/*
781	 * Validate arguments, but as a handle destructor, we can't fail.
782	 */
783	if (flags != 0)
784		(void)__db_ferr(env, "DB_MPOOLFILE->close", 0);
785
786	ENV_ENTER(env, ip);
787	REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret);
788	ENV_LEAVE(env, ip);
789	return (ret);
790}
791
792/*
793 * __memp_fclose --
794 *	DB_MPOOLFILE->close.
795 *
796 * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
797 */
798int
799__memp_fclose(dbmfp, flags)
800	DB_MPOOLFILE *dbmfp;
801	u_int32_t flags;
802{
803	DB_MPOOL *dbmp;
804	ENV *env;
805	MPOOLFILE *mfp;
806	char *rpath;
807	u_int32_t ref;
808	int deleted, ret, t_ret;
809
810	env = dbmfp->env;
811	dbmp = env->mp_handle;
812	ret = 0;
813
814	/*
815	 * Remove the DB_MPOOLFILE from the process' list.
816	 *
817	 * It's possible the underlying mpool cache may never have been created.
818	 * In that case, all we have is a structure, discard it.
819	 *
820	 * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
821	 * file list, check the MP_OPEN_CALLED flag to be sure.
822	 */
823	if (dbmp == NULL)
824		goto done;
825
826	MUTEX_LOCK(env, dbmp->mutex);
827
828	DB_ASSERT(env, dbmfp->ref >= 1);
829	if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED))
830		TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
831
832	/*
833	 * Decrement the file descriptor's ref count -- if we're the last ref,
834	 * we'll discard the file descriptor.
835	 */
836	if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0)
837		dbmfp->fhp = NULL;
838	MUTEX_UNLOCK(env, dbmp->mutex);
839	if (ref != 0)
840		return (0);
841
842	/* Complain if pinned blocks never returned. */
843	if (dbmfp->pinref != 0) {
844		__db_errx(env, "%s: close: %lu blocks left pinned",
845		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
846		ret = __env_panic(env, DB_RUNRECOVERY);
847	}
848
849	/* Discard any mmap information. */
850	if (dbmfp->addr != NULL &&
851	    (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0)
852		__db_err(env, ret, "%s", __memp_fn(dbmfp));
853
854	/*
855	 * Close the file and discard the descriptor structure; temporary
856	 * files may not yet have been created.
857	 */
858	if (dbmfp->fhp != NULL) {
859		if ((t_ret =
860		    __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0)
861			ret = t_ret;
862		if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) {
863			__db_err(env, t_ret, "%s", __memp_fn(dbmfp));
864			if (ret == 0)
865				ret = t_ret;
866		}
867		dbmfp->fhp = NULL;
868	}
869
870	/*
871	 * Discard our reference on the underlying MPOOLFILE, and close it
872	 * if it's no longer useful to anyone.  It possible the open of the
873	 * file never happened or wasn't successful, in which case, mpf will
874	 * be NULL and MP_OPEN_CALLED will not be set.
875	 */
876	mfp = dbmfp->mfp;
877	DB_ASSERT(env,
878	    (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) ||
879	    (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL));
880	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))
881		goto done;
882
883	/*
884	 * If it's a temp file, all outstanding references belong to unflushed
885	 * buffers.  (A temp file can only be referenced by one DB_MPOOLFILE).
886	 * We don't care about preserving any of those buffers, so mark the
887	 * MPOOLFILE as dead so that even the dirty ones just get discarded
888	 * when we try to flush them.
889	 */
890	deleted = 0;
891	if (!LF_ISSET(DB_MPOOL_NOLOCK))
892		MUTEX_LOCK(env, mfp->mutex);
893	if (F_ISSET(dbmfp, MP_MULTIVERSION))
894		--mfp->multiversion;
895	if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
896		if (LF_ISSET(DB_MPOOL_DISCARD) ||
897		    F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
898			mfp->deadfile = 1;
899		}
900		if (mfp->unlink_on_close) {
901			if ((t_ret = __db_appname(dbmp->env,
902			    DB_APP_DATA, R_ADDR(dbmp->reginfo,
903			    mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0)
904				ret = t_ret;
905			if (t_ret == 0) {
906				if ((t_ret = __os_unlink(
907				    dbmp->env, rpath, 0)) != 0 && ret == 0)
908					ret = t_ret;
909				__os_free(env, rpath);
910			}
911		}
912		if (mfp->block_cnt == 0) {
913			/*
914			 * We should never discard this mp file if our caller
915			 * is holding the lock on it.  See comment in
916			 * __memp_sync_file.
917			 */
918			DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK));
919			if ((t_ret =
920			    __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
921				ret = t_ret;
922			deleted = 1;
923		}
924	}
925	if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
926		MUTEX_UNLOCK(env, mfp->mutex);
927
928done:	/* Discard the DB_MPOOLFILE structure. */
929	if (dbmfp->pgcookie != NULL) {
930		__os_free(env, dbmfp->pgcookie->data);
931		__os_free(env, dbmfp->pgcookie);
932	}
933	__os_free(env, dbmfp);
934
935	return (ret);
936}
937
938/*
939 * __memp_mf_discard --
940 *	Discard an MPOOLFILE.
941 *
942 * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
943 */
944int
945__memp_mf_discard(dbmp, mfp)
946	DB_MPOOL *dbmp;
947	MPOOLFILE *mfp;
948{
949	DB_MPOOL_HASH *hp;
950	ENV *env;
951#ifdef HAVE_STATISTICS
952	DB_MPOOL_STAT *sp;
953#endif
954	MPOOL *mp;
955	int need_sync, ret, t_ret;
956
957	env = dbmp->env;
958	mp = dbmp->reginfo[0].primary;
959	hp = R_ADDR(dbmp->reginfo, mp->ftab);
960	hp += mfp->bucket;
961	ret = 0;
962
963	/*
964	 * Expects caller to be holding the MPOOLFILE mutex.
965	 *
966	 * When discarding a file, we have to flush writes from it to disk.
967	 * The scenario is that dirty buffers from this file need to be
968	 * flushed to satisfy a future checkpoint, but when the checkpoint
969	 * calls mpool sync, the sync code won't know anything about them.
970	 * Ignore files not written, discarded, or only temporary.
971	 */
972	need_sync =
973	   mfp->file_written && !mfp->deadfile && !F_ISSET(mfp, MP_TEMP);
974
975	/*
976	 * We have to release the MPOOLFILE mutex before acquiring the region
977	 * mutex so we don't deadlock.  Make sure nobody ever looks at this
978	 * structure again.
979	 */
980	mfp->deadfile = 1;
981
982	/* Discard the mutex we're holding and return it too the pool. */
983	MUTEX_UNLOCK(env, mfp->mutex);
984	if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
985		ret = t_ret;
986
987	/* Lock the bucket and delete from the list of MPOOLFILEs. */
988	MUTEX_LOCK(env, hp->mtx_hash);
989	SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
990	MUTEX_UNLOCK(env, hp->mtx_hash);
991
992	/* Lock the region and collect stats and free the space. */
993	MPOOL_SYSTEM_LOCK(env);
994	if (need_sync &&
995	    (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
996		ret = t_ret;
997
998#ifdef HAVE_STATISTICS
999	/* Copy the statistics into the region. */
1000	sp = &mp->stat;
1001	sp->st_cache_hit += mfp->stat.st_cache_hit;
1002	sp->st_cache_miss += mfp->stat.st_cache_miss;
1003	sp->st_map += mfp->stat.st_map;
1004	sp->st_page_create += mfp->stat.st_page_create;
1005	sp->st_page_in += mfp->stat.st_page_in;
1006	sp->st_page_out += mfp->stat.st_page_out;
1007#endif
1008
1009	/* Free the space. */
1010	if (mfp->path_off != 0)
1011		__memp_free(&dbmp->reginfo[0], NULL,
1012		    R_ADDR(dbmp->reginfo, mfp->path_off));
1013	if (mfp->fileid_off != 0)
1014		__memp_free(&dbmp->reginfo[0], NULL,
1015		    R_ADDR(dbmp->reginfo, mfp->fileid_off));
1016	if (mfp->pgcookie_off != 0)
1017		__memp_free(&dbmp->reginfo[0], NULL,
1018		    R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
1019	__memp_free(&dbmp->reginfo[0], NULL, mfp);
1020
1021	MPOOL_SYSTEM_UNLOCK(env);
1022
1023	return (ret);
1024}
1025
1026/*
1027 * __memp_inmemlist --
1028 *	Return a list of the named in-memory databases.
1029 *
1030 * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *));
1031 */
1032int
1033__memp_inmemlist(env, namesp, cntp)
1034	ENV *env;
1035	char ***namesp;
1036	int *cntp;
1037{
1038	DB_MPOOL *dbmp;
1039	DB_MPOOL_HASH *hp;
1040	MPOOL *mp;
1041	MPOOLFILE *mfp;
1042	int arraysz, cnt, i, ret;
1043	char **names;
1044
1045	names = NULL;
1046	dbmp = env->mp_handle;
1047	mp = dbmp->reginfo[0].primary;
1048	hp = R_ADDR(dbmp->reginfo, mp->ftab);
1049
1050	arraysz = cnt = 0;
1051	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
1052		MUTEX_LOCK(env, hp->mtx_hash);
1053		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
1054			/* Skip dead files and temporary files. */
1055			if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
1056				continue;
1057
1058			/* Skip entries that allow files. */
1059			if (!mfp->no_backing_file)
1060				continue;
1061
1062			/* We found one. */
1063			if (cnt >= arraysz) {
1064				arraysz += 100;
1065				if ((ret = __os_realloc(env,
1066				    (u_int)arraysz * sizeof(names[0]),
1067				    &names)) != 0)
1068					goto nomem;
1069			}
1070			if ((ret = __os_strdup(env,
1071			    R_ADDR(dbmp->reginfo, mfp->path_off),
1072			    &names[cnt])) != 0)
1073				goto nomem;
1074
1075			cnt++;
1076		}
1077		MUTEX_UNLOCK(env, hp->mtx_hash);
1078	}
1079	*namesp = names;
1080	*cntp = cnt;
1081	return (0);
1082
1083nomem:	MUTEX_UNLOCK(env, hp->mtx_hash);
1084	if (names != NULL) {
1085		while (--cnt >= 0)
1086			__os_free(env, names[cnt]);
1087		__os_free(env, names);
1088	}
1089
1090	/* Make sure we don't return any garbage. */
1091	*cntp = 0;
1092	*namesp = NULL;
1093	return (ret);
1094}
1095