/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

/*
 * __memp_env_create --
 *	Mpool specific creation of the DB_ENV structure.
 *
 * PUBLIC: int __memp_env_create __P((DB_ENV *));
 */
int
__memp_env_create(dbenv)
	DB_ENV *dbenv;
{
	/*
	 * !!!
	 * Our caller has not yet had the opportunity to reset the panic
	 * state or turn off mutex locking, and so we can neither check
	 * the panic state nor acquire a mutex in the DB_ENV create path.
	 *
	 * We default to 32 8K pages.  We don't default to a flat 256K, because
	 * some systems require significantly more memory to hold 32 pages than
	 * others.  For example, HP-UX with POSIX pthreads needs 88 bytes for
	 * a POSIX pthread mutex and almost 200 bytes per buffer header, while
	 * Solaris needs 24 and 52 bytes for the same structures.  The minimum
	 * number of hash buckets is 37, each of which also contains a mutex.
	 */
	dbenv->mp_bytes = dbenv->mp_max_bytes =
	    32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
	dbenv->mp_ncache = 1;

	return (0);
}

/*
 * __memp_env_destroy --
 *	Mpool specific destruction of the DB_ENV structure.
 *
 * PUBLIC: void __memp_env_destroy __P((DB_ENV *));
 */
void
__memp_env_destroy(dbenv)
	DB_ENV *dbenv;
{
	COMPQUIET(dbenv, NULL);
}

/*
 * __memp_get_cachesize --
 *	{DB_ENV,DB}->get_cachesize.
 *
 * PUBLIC: int __memp_get_cachesize
 * PUBLIC:         __P((DB_ENV *, u_int32_t *, u_int32_t *, int *));
 */
int
__memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
	DB_ENV *dbenv;
	u_int32_t *gbytesp, *bytesp;
	int *ncachep;
{
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_cachesize", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		/* Cannot be set after open, no lock required to read. */
		mp = env->mp_handle->reginfo[0].primary;
		if (gbytesp != NULL)
			*gbytesp = mp->stat.st_gbytes;
		if (bytesp != NULL)
			*bytesp = mp->stat.st_bytes;
		if (ncachep != NULL)
			*ncachep = (int)mp->nreg;
	} else {
		if (gbytesp != NULL)
			*gbytesp = dbenv->mp_gbytes;
		if (bytesp != NULL)
			*bytesp = dbenv->mp_bytes;
		if (ncachep != NULL)
			*ncachep = (int)dbenv->mp_ncache;
	}
	return (0);
}

/*
 * __memp_set_cachesize --
 *	{DB_ENV,DB}->set_cachesize.
 *
 * PUBLIC: int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
 */
int
__memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
	DB_ENV *dbenv;
	u_int32_t gbytes, bytes;
	int arg_ncache;
{
	ENV *env;
	u_int ncache;

	env = dbenv->env;

	/* Normalize the cache count. */
	ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache;

	/*
	 * You can only store 4GB-1 in an unsigned 32-bit value, so correct for
	 * applications that specify 4GB cache sizes -- we know what they meant.
	 */
	if (sizeof(roff_t) == 4 && gbytes / ncache == 4 && bytes == 0) {
		--gbytes;
		bytes = GIGABYTE - 1;
	} else {
		gbytes += bytes / GIGABYTE;
		bytes %= GIGABYTE;
	}
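	/*
	 * For example (illustrative only): on a build with 32-bit region
	 * offsets, a call specifying gbytes = 4, bytes = 0, ncache = 1 is
	 * quietly converted to 3GB plus (GIGABYTE - 1) bytes; any other
	 * request simply has full gigabytes folded out of "bytes".
	 */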

	/*
	 * !!!
	 * With 32-bit region offsets, individual cache regions must be smaller
	 * than 4GB.  Also, cache sizes larger than 10TB would cause 32-bit
	 * wrapping in the calculation of the number of hash buckets.  See
	 * __memp_open for details.
	 */
	if (!F_ISSET(env, ENV_OPEN_CALLED)) {
		if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) {
			__db_errx(env,
			    "individual cache size too large: maximum is 4GB");
			return (EINVAL);
		}
		if (gbytes / ncache > 10000) {
			__db_errx(env,
			    "individual cache size too large: maximum is 10TB");
			return (EINVAL);
		}
	}

	/*
	 * If the application requested less than 500MB, increase the cache
	 * size by 25% and factor in the size of the hash buckets to account
	 * for our overhead.  (We assume caches over 500MB are deliberately
	 * sized, that is, it's a large server and the application actually
	 * knows how much memory is available.  We only document the 25%
	 * overhead number, not the hash buckets; there's no reason to confuse
	 * the issue, and it shouldn't matter to an application.)
	 *
	 * There is a minimum cache size, regardless.
	 */
	if (gbytes == 0) {
		if (bytes < 500 * MEGABYTE)
			bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
		if (bytes / ncache < DB_CACHESIZE_MIN)
			bytes = ncache * DB_CACHESIZE_MIN;
	}
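	/*
	 * For example, a request for a 10MB cache in a single region ends up
	 * as 10MB + 2.5MB plus 37 hash-bucket headers, and anything that
	 * works out to less than DB_CACHESIZE_MIN per region is rounded up
	 * to that minimum.
	 */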

	if (F_ISSET(env, ENV_OPEN_CALLED))
		return (__memp_resize(env->mp_handle, gbytes, bytes));

	dbenv->mp_gbytes = gbytes;
	dbenv->mp_bytes = bytes;
	dbenv->mp_ncache = ncache;

	return (0);
}
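
/*
 * Illustrative application-level call (not part of this file): a process
 * configuring a single 64MB cache before DB_ENV->open() reaches the code
 * above through the DB_ENV method table:
 *
 *	if ((ret = dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 1)) != 0)
 *		(void)dbenv->close(dbenv, 0);
 */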

/*
 * __memp_set_config --
 *	Set the cache subsystem configuration.
 *
 * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int));
 */
int
__memp_set_config(dbenv, which, on)
	DB_ENV *dbenv;
	u_int32_t which;
	int on;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL);

	switch (which) {
	case DB_MEMP_SUPPRESS_WRITE:
	case DB_MEMP_SYNC_INTERRUPT:
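		/*
		 * These flags live only in the shared mpool region; if the
		 * cache has not been created yet (DB_ENV->open has not been
		 * called), the request is accepted but not retained.
		 */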
		if (MPOOL_ON(env)) {
			dbmp = env->mp_handle;
			mp = dbmp->reginfo[0].primary;
			if (on)
				FLD_SET(mp->config_flags, which);
			else
				FLD_CLR(mp->config_flags, which);
		}
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

/*
 * __memp_get_config --
 *	Return the cache subsystem configuration.
 *
 * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
 */
int
__memp_get_config(dbenv, which, onp)
	DB_ENV *dbenv;
	u_int32_t which;
	int *onp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL);

	switch (which) {
	case DB_MEMP_SUPPRESS_WRITE:
	case DB_MEMP_SYNC_INTERRUPT:
		if (MPOOL_ON(env)) {
			dbmp = env->mp_handle;
			mp = dbmp->reginfo[0].primary;
			*onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0;
		} else
			*onp = 0;
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

/*
 * PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
 */
int
__memp_get_mp_max_openfd(dbenv, maxopenfdp)
	DB_ENV *dbenv;
	int *maxopenfdp;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_max_openfd", DB_INIT_MPOOL);

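	/*
	 * Once the cache exists, the setting lives in the shared region and
	 * is read (and, in the set functions below, written) while holding
	 * the region lock; before DB_ENV->open it is simply staged in the
	 * DB_ENV handle.
	 */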
	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		*maxopenfdp = mp->mp_maxopenfd;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else
		*maxopenfdp = dbenv->mp_maxopenfd;
	return (0);
}

/*
 * __memp_set_mp_max_openfd --
 *	Set the maximum number of open fd's when flushing the cache.
 *
 * PUBLIC: int __memp_set_mp_max_openfd __P((DB_ENV *, int));
 */
int
__memp_set_mp_max_openfd(dbenv, maxopenfd)
	DB_ENV *dbenv;
	int maxopenfd;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->set_mp_max_openfd", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		mp->mp_maxopenfd = maxopenfd;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else
		dbenv->mp_maxopenfd = maxopenfd;
	return (0);
}

/*
 * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
 */
int
__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
	DB_ENV *dbenv;
	int *maxwritep;
	db_timeout_t *maxwrite_sleepp;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		*maxwritep = mp->mp_maxwrite;
		*maxwrite_sleepp = mp->mp_maxwrite_sleep;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else {
		*maxwritep = dbenv->mp_maxwrite;
		*maxwrite_sleepp = dbenv->mp_maxwrite_sleep;
	}
	return (0);
}

/*
 * __memp_set_mp_max_write --
 *	Set the maximum continuous I/O count.
 *
 * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
 */
int
__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
	DB_ENV *dbenv;
	int maxwrite;
	db_timeout_t maxwrite_sleep;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->set_mp_max_write", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		mp->mp_maxwrite = maxwrite;
		mp->mp_maxwrite_sleep = maxwrite_sleep;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else {
		dbenv->mp_maxwrite = maxwrite;
		dbenv->mp_maxwrite_sleep = maxwrite_sleep;
	}
	return (0);
}

/*
 * PUBLIC: int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *));
 */
int
__memp_get_mp_mmapsize(dbenv, mp_mmapsizep)
	DB_ENV *dbenv;
	size_t *mp_mmapsizep;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_mmapsize", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		*mp_mmapsizep = mp->mp_mmapsize;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else
		*mp_mmapsizep = dbenv->mp_mmapsize;
	return (0);
}

/*
 * __memp_set_mp_mmapsize --
 *	DB_ENV->set_mp_mmapsize.
 *
 * PUBLIC: int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
 */
int
__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
	DB_ENV *dbenv;
	size_t mp_mmapsize;
{
	DB_MPOOL *dbmp;
	DB_THREAD_INFO *ip;
	ENV *env;
	MPOOL *mp;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->set_mp_mmapsize", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		ENV_ENTER(env, ip);
		MPOOL_SYSTEM_LOCK(env);
		mp->mp_mmapsize = mp_mmapsize;
		MPOOL_SYSTEM_UNLOCK(env);
		ENV_LEAVE(env, ip);
	} else
		dbenv->mp_mmapsize = mp_mmapsize;
	return (0);
}

/*
 * PUBLIC: int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *));
 */
int
__memp_get_mp_pagesize(dbenv, mp_pagesizep)
	DB_ENV *dbenv;
	u_int32_t *mp_pagesizep;
{
	ENV *env;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_pagesize", DB_INIT_MPOOL);

	*mp_pagesizep = dbenv->mp_pagesize;
	return (0);
}

/*
 * __memp_set_mp_pagesize --
 *	DB_ENV->set_mp_pagesize.
 *
 * PUBLIC: int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t));
 */
int
__memp_set_mp_pagesize(dbenv, mp_pagesize)
	DB_ENV *dbenv;
	u_int32_t mp_pagesize;
{
	ENV *env;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->set_mp_pagesize", DB_INIT_MPOOL);
	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize");

	dbenv->mp_pagesize = mp_pagesize;
	return (0);
}

/*
 * PUBLIC: int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *));
 */
int
__memp_get_mp_tablesize(dbenv, mp_tablesizep)
	DB_ENV *dbenv;
	u_int32_t *mp_tablesizep;
{
	ENV *env;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_tablesize", DB_INIT_MPOOL);

	*mp_tablesizep = dbenv->mp_tablesize;
	return (0);
}

/*
 * __memp_set_mp_tablesize --
 *	DB_ENV->set_mp_tablesize.
 *
 * PUBLIC: int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t));
 */
int
__memp_set_mp_tablesize(dbenv, mp_tablesize)
	DB_ENV *dbenv;
	u_int32_t mp_tablesize;
{
	ENV *env;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->set_mp_tablesize", DB_INIT_MPOOL);
	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize");

	dbenv->mp_tablesize = mp_tablesize;
	return (0);
}

/*
 * __memp_nameop --
 *	Remove or rename a file in the pool.
 *
 * PUBLIC: int __memp_nameop __P((ENV *,
 * PUBLIC:     u_int8_t *, const char *, const char *, const char *, int));
 *
 * XXX
 * Undocumented interface: DB private.
 */
int
__memp_nameop(env, fileid, newname, fullold, fullnew, inmem)
	ENV *env;
	u_int8_t *fileid;
	const char *newname, *fullold, *fullnew;
	int inmem;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp, *nhp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	roff_t newname_off;
	u_int32_t bucket;
	int locked, ret;
	size_t nlen;
	void *p;

#undef	op_is_remove
#define	op_is_remove	(newname == NULL)

	COMPQUIET(bucket, 0);
	COMPQUIET(hp, NULL);
	COMPQUIET(newname_off, 0);
	COMPQUIET(nlen, 0);

	dbmp = NULL;
	mfp = NULL;
	nhp = NULL;
	p = NULL;
	locked = ret = 0;

	if (!MPOOL_ON(env))
		goto fsop;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	hp = R_ADDR(dbmp->reginfo, mp->ftab);

	if (!op_is_remove) {
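		/*
		 * Allocate the new name in the mpool region: mfp->path_off is
		 * a region offset, so the string has to be visible to every
		 * process sharing the environment.
		 */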
		nlen = strlen(newname);
		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
		    NULL, nlen + 1, &newname_off, &p)) != 0)
			return (ret);
		memcpy(p, newname, nlen + 1);
	}

	/*
	 * Remove or rename a file that the mpool might know about.  We assume
	 * that the fop layer has the file locked for exclusive access, so we
	 * don't worry about locking except for the mpool mutexes.  Checkpoint
	 * can happen at any time, independent of file locking, so we have to
	 * do the actual unlink or rename system call while holding
	 * all affected buckets locked.
	 *
	 * If this is a rename of an in-memory file, we need to make sure
	 * that the new name does not already exist.  Since we may be locking
	 * two buckets, lock them in ascending order.
	 */
	if (inmem) {
		DB_ASSERT(env, fullold != NULL);
		hp += FNBUCKET(fullold, strlen(fullold));
		if (!op_is_remove) {
			bucket = FNBUCKET(newname, nlen);
			nhp = R_ADDR(dbmp->reginfo, mp->ftab);
			nhp += bucket;
		}
	} else
		hp += FNBUCKET(fileid, DB_FILE_ID_LEN);

	if (nhp != NULL && nhp < hp)
		MUTEX_LOCK(env, nhp->mtx_hash);
	MUTEX_LOCK(env, hp->mtx_hash);
	if (nhp != NULL && nhp > hp)
		MUTEX_LOCK(env, nhp->mtx_hash);
	locked = 1;

	if (!op_is_remove && inmem) {
		SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
			if (!mfp->deadfile &&
			    mfp->no_backing_file && strcmp(newname,
			    R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
				break;
		if (mfp != NULL) {
			ret = EEXIST;
			goto err;
		}
	}

	/*
	 * Find the file -- if mpool doesn't know about this file, that may
	 * not be an error.
	 */
	SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
		/* Ignore non-active files. */
		if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
			continue;

		/* Try to match on fileid. */
		if (memcmp(fileid, R_ADDR(
		    dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
			continue;

		break;
	}

	if (mfp == NULL) {
		if (inmem) {
			ret = ENOENT;
			goto err;
		}
		goto fsop;
	}

	if (op_is_remove) {
		MUTEX_LOCK(env, mfp->mutex);
		/*
		 * In-memory dbs have an artificially incremented ref count so
		 * they do not get reclaimed as long as they exist.  Since we
		 * are now deleting the database, we need to decrement that
		 * count.
		 */
		if (mfp->no_backing_file)
			mfp->mpf_cnt--;
		mfp->deadfile = 1;
		MUTEX_UNLOCK(env, mfp->mutex);
	} else {
		/*
		 * Otherwise, it's a rename.  We've allocated memory for the
		 * new name; swap it with the old one.  If the file is
		 * in-memory, we also need to move it to the right hash
		 * bucket.
		 */
		p = R_ADDR(dbmp->reginfo, mfp->path_off);
		mfp->path_off = newname_off;
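		/*
		 * From here on, p addresses the old name, and that is what
		 * the free at the "err" label below releases; on the earlier
		 * error paths p still addresses the newly allocated name, so
		 * the unused copy is released instead.
		 */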

		if (inmem && hp != nhp) {
			DB_ASSERT(env, nhp != NULL);
			SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
			mfp->bucket = bucket;
			SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q);
		}
	}

fsop:	/*
	 * If this is a real file, mfp may be NULL because mpool isn't
	 * turned on; we still need to do the file ops.
	 */
	if (mfp == NULL || !mfp->no_backing_file) {
		if (op_is_remove) {
			/*
			 * !!!
			 * Replication may ask us to unlink a file that's been
			 * renamed.  Don't complain if it doesn't exist.
			 */
			if ((ret = __os_unlink(env, fullold, 0)) == ENOENT)
				ret = 0;
		} else {
			/*
			 * Defensive only; fullnew should never be NULL.
			 */
			DB_ASSERT(env, fullnew != NULL);
			if (fullnew == NULL) {
				ret = EINVAL;
				goto err;
			}
			ret = __os_rename(env, fullold, fullnew, 1);
		}
	}

	/* Free the memory we no longer need. */
err:	if (p != NULL) {
		MPOOL_REGION_LOCK(env, &dbmp->reginfo[0]);
		__memp_free(&dbmp->reginfo[0], p);
		MPOOL_REGION_UNLOCK(env, &dbmp->reginfo[0]);
	}

	/* If we have buckets locked, unlock them when done moving files. */
	if (locked == 1) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		if (nhp != NULL && nhp != hp)
			MUTEX_UNLOCK(env, nhp->mtx_hash);
	}
	return (ret);
}

/*
 * __memp_ftruncate --
 *	Truncate the file.
 *
 * PUBLIC: int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *,
 * PUBLIC:     DB_THREAD_INFO *, db_pgno_t, u_int32_t));
 */
int
__memp_ftruncate(dbmfp, txn, ip, pgno, flags)
	DB_MPOOLFILE *dbmfp;
	DB_TXN *txn;
	DB_THREAD_INFO *ip;
	db_pgno_t pgno;
	u_int32_t flags;
{
	ENV *env;
	MPOOLFILE *mfp;
	void *pagep;
	db_pgno_t last_pgno, pg;
	int ret;

	env = dbmfp->env;
	mfp = dbmfp->mfp;
	ret = 0;

	MUTEX_LOCK(env, mfp->mutex);
	last_pgno = mfp->last_pgno;
	MUTEX_UNLOCK(env, mfp->mutex);

	if (pgno > last_pgno) {
		if (LF_ISSET(MP_TRUNC_RECOVER))
			return (0);
		__db_errx(env, "Truncate beyond the end of file");
		return (EINVAL);
	}

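	/*
	 * Discard any copies of the truncated pages that are still in the
	 * cache (DB_MPOOL_FREE drops the buffer); stop early once the file
	 * has no buffers left in the pool.
	 */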
	pg = pgno;
	do {
		if (mfp->block_cnt == 0)
			break;
		if ((ret = __memp_fget(dbmfp, &pg,
		    ip, txn, DB_MPOOL_FREE, &pagep)) != 0)
			return (ret);
	} while (pg++ < last_pgno);

	/*
	 * If we are aborting an extend of a file, the call to __os_truncate
	 * could extend the file if the new page(s) had not yet been
	 * written to disk.  We do not want to extend the file to pages
	 * whose log records are not yet flushed [#14031].  In addition, if
	 * we are out of disk space, we could generate an error [#12743].
	 */
	MUTEX_LOCK(env, mfp->mutex);
	if (!F_ISSET(mfp, MP_TEMP) &&
	    !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno)
#ifdef HAVE_FTRUNCATE
		ret = __os_truncate(env,
		    dbmfp->fhp, pgno, mfp->stat.st_pagesize);
#else
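		/*
		 * Without ftruncate support the file cannot be shortened;
		 * zero out the pages past the new end of file instead.
		 */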
		ret = __db_zero_extend(env,
		    dbmfp->fhp, pgno, mfp->last_pgno, mfp->stat.st_pagesize);
#endif

	/*
	 * This update could race with another thread of control that is
	 * extending the file.  It's not a problem because we should have the
	 * page locked at a higher level of the system.
	 */
	if (ret == 0) {
		mfp->last_pgno = pgno - 1;
		if (mfp->last_flushed_pgno > mfp->last_pgno)
			mfp->last_flushed_pgno = mfp->last_pgno;
	}
	MUTEX_UNLOCK(env, mfp->mutex);

	return (ret);
}

#ifdef HAVE_FTRUNCATE
/*
 * Support routines for maintaining a sorted freelist while we try to rearrange
 * and truncate the file.
 */

/*
 * __memp_alloc_freelist --
 *	Allocate mpool space for the freelist.
 *
 * PUBLIC: int __memp_alloc_freelist __P((DB_MPOOLFILE *,
 * PUBLIC:	 u_int32_t, db_pgno_t **));
 */
int
__memp_alloc_freelist(dbmfp, nelems, listp)
	DB_MPOOLFILE *dbmfp;
	u_int32_t nelems;
	db_pgno_t **listp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOLFILE *mfp;
	void *retp;
	int ret;

	env = dbmfp->env;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;

	*listp = NULL;

	/*
	 * These fields are protected because the database layer
	 * has the metapage locked while manipulating them.
	 */
	mfp->free_ref++;
	if (mfp->free_size != 0)
		return (EBUSY);

	/* Allocate at least a few slots. */
	mfp->free_cnt = nelems;
	if (nelems == 0)
		nelems = 50;

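	/*
	 * Note that free_cnt records the count the caller asked for, while
	 * the allocation below is padded to at least 50 entries so a small
	 * list can grow without an immediate reallocation.
	 */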
	if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
	    NULL, nelems * sizeof(db_pgno_t), &mfp->free_list, &retp)) != 0)
		return (ret);

	mfp->free_size = nelems * sizeof(db_pgno_t);
	*listp = retp;
	return (0);
}

/*
 * __memp_free_freelist --
 *	Free the list.
 *
 * PUBLIC: int __memp_free_freelist __P((DB_MPOOLFILE *));
 */
int
__memp_free_freelist(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOLFILE *mfp;

	env = dbmfp->env;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;

	DB_ASSERT(env, mfp->free_ref > 0);
	if (--mfp->free_ref > 0)
		return (0);

	DB_ASSERT(env, mfp->free_size != 0);

	MPOOL_SYSTEM_LOCK(env);
	__memp_free(dbmp->reginfo, R_ADDR(dbmp->reginfo, mfp->free_list));
	MPOOL_SYSTEM_UNLOCK(env);

	mfp->free_cnt = 0;
	mfp->free_list = 0;
	mfp->free_size = 0;
	return (0);
}

/*
 * __memp_get_freelist --
 *	Return the current free list.
 *
 * PUBLIC: int __memp_get_freelist __P((
 * PUBLIC:	DB_MPOOLFILE *, u_int32_t *, db_pgno_t **));
 */
int
__memp_get_freelist(dbmfp, nelemp, listp)
	DB_MPOOLFILE *dbmfp;
	u_int32_t *nelemp;
	db_pgno_t **listp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOLFILE *mfp;

	env = dbmfp->env;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;

	if (mfp->free_size == 0) {
		*nelemp = 0;
		*listp = NULL;
	} else {
		*nelemp = mfp->free_cnt;
		*listp = R_ADDR(dbmp->reginfo, mfp->free_list);
	}

	return (0);
}

/*
 * __memp_extend_freelist --
 *	Extend the list.
 *
 * PUBLIC: int __memp_extend_freelist __P((
 * PUBLIC:	DB_MPOOLFILE *, u_int32_t, db_pgno_t **));
 */
int
__memp_extend_freelist(dbmfp, count, listp)
	DB_MPOOLFILE *dbmfp;
	u_int32_t count;
	db_pgno_t **listp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOLFILE *mfp;
	int ret;
	void *retp;

	env = dbmfp->env;
	dbmp = env->mp_handle;
	mfp = dbmfp->mfp;

	if (mfp->free_size == 0)
		return (EINVAL);

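	/*
	 * If the caller needs more slots than are currently allocated, grow
	 * the shared-memory array: the allocation is rounded up to a 512-byte
	 * boundary so repeated small extensions do not reallocate every time,
	 * the old contents are copied over, and the old array is freed.
	 */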
	if (count * sizeof(db_pgno_t) > mfp->free_size) {
		mfp->free_size =
		     (size_t)DB_ALIGN(count * sizeof(db_pgno_t), 512);
		*listp = R_ADDR(dbmp->reginfo, mfp->free_list);
		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
		    NULL, mfp->free_size, &mfp->free_list, &retp)) != 0)
			return (ret);

		memcpy(retp, *listp, mfp->free_cnt * sizeof(db_pgno_t));

		MPOOL_SYSTEM_LOCK(env);
		__memp_free(dbmp->reginfo, *listp);
		MPOOL_SYSTEM_UNLOCK(env);
	}

	mfp->free_cnt = count;
	*listp = R_ADDR(dbmp->reginfo, mfp->free_list);

	return (0);
}
#endif

/*
 * __memp_set_last_pgno --
 *	Set the last page of the file.
 *
 * PUBLIC: void __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t));
 */
void
__memp_set_last_pgno(dbmfp, pgno)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t pgno;
{
	dbmfp->mfp->last_pgno = pgno;
}