/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_sync.c,v 12.59 2008/01/17 13:59:12 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

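/*
 * BH_TRACK --
 *	A record of a dirty buffer to write, built up by __memp_sync_int().
 *	We track the hash bucket and the buffer's file and page identity
 *	rather than a buffer pointer, and look the buffer up again under
 *	the bucket mutex before writing it, as it may have been written or
 *	discarded by another thread in the meantime.
 */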
typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */

	roff_t	  track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;

static int __bhcmp __P((const void *, const void *));
static int __memp_close_flush_files __P((ENV *, int));
static int __memp_sync_files __P((ENV *));
static int __memp_sync_file __P((ENV *,
		MPOOLFILE *, void *, u_int32_t *, u_int32_t));

/*
 * __memp_walk_files --
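 *	Walk the file table, calling a function on each MPOOLFILE.
 *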
 * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
 * PUBLIC:	int (*) __P((ENV *, MPOOLFILE *, void *,
 * PUBLIC:	u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
 */
int
__memp_walk_files(env, mp, func, arg, countp, flags)
	ENV *env;
	MPOOL *mp;
	int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
	void *arg;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOLFILE *mfp;
	int i, ret, t_ret;

	dbmp = env->mp_handle;
	ret = 0;

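	/*
	 * The file table lives in the mpool region; convert its region
	 * offset into an address before walking the buckets.
	 */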
	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
			if ((t_ret = func(env,
			    mfp, arg, countp, flags)) != 0 && ret == 0)
				ret = t_ret;
			if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
				break;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
		if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
			break;
	}
	return (ret);
}

/*
 * __memp_sync_pp --
 *	ENV->memp_sync pre/post processing.
 *
 * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync_pp(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(env,
		    env->lg_handle, "memp_sync", DB_INIT_LOG);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_sync --
 *	ENV->memp_sync.
 *
 * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
 */
int
__memp_sync(env, flags, lsnp)
	ENV *env;
	u_int32_t flags;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int interrupted, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/* If we've flushed to the requested LSN, return that information. */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;

			MPOOL_SYSTEM_UNLOCK(env);
			return (0);
		}
		MPOOL_SYSTEM_UNLOCK(env);
	}

	if ((ret =
	    __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
		return (ret);

	if (!interrupted && lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		MPOOL_SYSTEM_UNLOCK(env);
	}

	return (0);
}

/*
 * __memp_fsync_pp --
 *	DB_MPOOLFILE->sync pre/post processing.
 *
 * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
 */
int
__memp_fsync_pp(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fsync --
 *	DB_MPOOLFILE->sync.
 *
 * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
 */
int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	MPOOLFILE *mfp;

	mfp = dbmfp->mfp;

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, or if the file hasn't
	 * been written since it was flushed, there's no reason to proceed
	 * further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
		return (0);

	if (mfp->file_written == 0)
		return (0);

	return (__memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}

/*
 * __mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
__mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	int ret;

	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges).  The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	if ((*fhp = dbmfp->fhp) != NULL)
		return (0);

	if ((ret = __memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
		*fhp = dbmfp->fhp;
	return (ret);
}
/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 * PUBLIC: int __memp_sync_int __P((ENV *,
 * PUBLIC:     DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
 */
int
__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
	ENV *env;
	DB_MPOOLFILE *dbmfp;
	u_int32_t trickle_max, flags, *wrote_totalp;
	int *interruptedp;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	db_mutex_t mutex;
	roff_t last_mf_offset;
	u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total;
	int filecnt, maxopenfd, pass, required_write, ret, t_ret;
	int wait_cnt, wrote_cnt;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	last_mf_offset = INVALID_ROFF;
	filecnt = pass = wrote_total = 0;

	if (wrote_totalp != NULL)
		*wrote_totalp = 0;
	if (interruptedp != NULL)
		*interruptedp = 0;

	/*
	 * If we're flushing the cache, taking a checkpoint, or flushing a
	 * specific file, we really have to write the blocks and we have to
	 * confirm they made it to disk.  Otherwise, we can skip a block if
	 * it's hard to get.
	 */
	required_write = LF_ISSET(DB_SYNC_CACHE |
	    DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);

	/* Get shared configuration information. */
	MPOOL_SYSTEM_LOCK(env);
	maxopenfd = mp->mp_maxopenfd;
	MPOOL_SYSTEM_UNLOCK(env);

	/* Assume one dirty page per bucket. */
	ar_max = mp->nreg * mp->htab_buckets;
	if ((ret =
	    __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written, or potentially written, depending on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as
			 * we only care if the pointer is zero or non-zero.
			 * We can ignore empty or clean buckets because we
			 * only need to write buffers that were dirty before
			 * we started.
			 */
#ifdef DIAGNOSTIC
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
#else
			if (hp->hash_page_dirty == 0)
#endif
				continue;

			dirty = 0;
			MUTEX_LOCK(env, hp->mtx_hash);
			SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
				/* Always ignore clean pages. */
				if (!F_ISSET(bhp, BH_DIRTY))
					continue;

				dirty++;
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore in-memory files, unless the file is
				 * specifically being flushed.
				 */
				if (mfp->no_backing_file)
					continue;
				if (!LF_ISSET(DB_SYNC_FILE) &&
				    F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
				    mfp->lsn_off == DB_LSN_OFF_NOTSET)
					continue;

				/*
				 * Ignore files that aren't Queue extent files
				 * if we're flushing a Queue file with extents.
				 */
				if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
				    !F_ISSET(mfp, MP_EXTENT))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				/*
				 * If we run out of space, double and continue.
				 * Don't stop at trickle_max, we want to sort
				 * as large a sample set as possible in order
				 * to minimize disk seeks.
				 */
				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(env,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}
			DB_ASSERT(env, dirty == hp->hash_page_dirty);
			if (dirty != hp->hash_page_dirty) {
				__db_errx(env,
				    "memp_sync: correcting dirty count %lu %lu",
				    (u_long)hp->hash_page_dirty, (u_long)dirty);
				hp->hash_page_dirty = dirty;
			}
			MUTEX_UNLOCK(env, hp->mtx_hash);

			if (ret != 0)
				goto err;

			/* Check if the call has been interrupted. */
			if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
			    mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
				if (interruptedp != NULL)
					*interruptedp = 1;
				goto err;
			}
		}
	}

	/* If there are no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);

	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage.
	 */
	if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
		ar_cnt = trickle_max;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
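		/*
		 * When we reach the end of the array with buffers still
		 * outstanding, sleep briefly and make another pass.
		 */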
		if (i >= ar_cnt) {
			i = 0;
			++pass;
			__os_yield(env, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutex = hp->mtx_hash;
		MUTEX_LOCK(env, mutex);
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer, we're done: somebody else must
		 * have written it.
		 *
		 * If the buffer isn't dirty, we're done, there's no work
		 * needed.
		 */
		if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
			MUTEX_UNLOCK(env, mutex);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 *
		 * If the buffer is pinned and it's only the first or second
		 * time we have looked at it, ignore it, we'll come back to
		 * it.
		 *
		 * In either case, skip the buffer if we're not required to
		 * write it.
		 */
		if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
			MUTEX_UNLOCK(env, mutex);
			if (!required_write) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/* Pin the buffer into memory and lock it. */
		++bhp->ref;
		F_SET(bhp, BH_LOCKED);

		/*
		 * If the buffer is referenced by another thread, set the sync
		 * wait-for count (used to count down outstanding references to
		 * this buffer as they are returned to the cache), then unlock
		 * the hash bucket and wait for the count to go to 0.  No other
		 * thread can acquire the buffer because we have it locked.
		 *
		 * If a thread attempts to re-pin a page, the wait-for count
		 * will never go to 0 (that thread spins on our buffer lock,
		 * while we spin on the thread's ref count).  Give up if we
		 * don't get the buffer in 3 seconds; we'll try again later.
		 *
		 * If, when the wait-for count goes to 0, the buffer is found
		 * to be dirty, write it.
		 */
		bhp->ref_sync = bhp->ref - 1;
		if (bhp->ref_sync != 0) {
			MUTEX_UNLOCK(env, mutex);
			for (wait_cnt = 1;
			    bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
				__os_yield(env, 1, 0);
			MUTEX_LOCK(env, mutex);
		}

		/*
		 * If we've switched files, check to see if we're configured
		 * to close file descriptors.
		 */
		if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
			if (++filecnt >= maxopenfd) {
				filecnt = 0;
				if ((t_ret = __memp_close_flush_files(
				    env, 1)) != 0 && ret == 0)
					ret = t_ret;
			}
			last_mf_offset = bhp->mf_offset;
		}

		/*
		 * If the ref_sync count has gone to 0, we're going to be done
		 * with this buffer no matter what happens.
		 */
		if (bhp->ref_sync == 0) {
			--remaining;
			bharray[i].track_hp = NULL;
		}

		/*
		 * If the ref_sync count has gone to 0 and the buffer is still
		 * dirty, we write it.  We only try to write the buffer once.
		 */
		if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((t_ret =
			    __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
				++wrote_cnt;
				++wrote_total;
			} else {
				if (ret == 0)
					ret = t_ret;
				__db_errx
				    (env, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);

			}
		}

		/*
		 * If the ref_sync count never went to 0, if the buffer was
		 * written by another thread, or if the write failed, we still
		 * have the buffer locked.
		 */
		if (F_ISSET(bhp, BH_LOCKED))
			F_CLR(bhp, BH_LOCKED);

		/*
		 * Reset the ref_sync count regardless of our success; we're
		 * done with this buffer for now.
		 */
		bhp->ref_sync = 0;

		/* Discard our buffer reference. */
		--bhp->ref;

		/*
		 * If a thread of control is waiting in this hash bucket, wake
		 * it up.
		 */
		if (F_ISSET(hp, IO_WAITER)) {
			F_CLR(hp, IO_WAITER);
			MUTEX_UNLOCK(env, hp->mtx_io);
		}

		/* Release the hash bucket mutex. */
		MUTEX_UNLOCK(env, mutex);

		/* Check if the call has been interrupted. */
		if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
		    FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
			if (interruptedp != NULL)
				*interruptedp = 1;
			goto err;
		}

		/*
		 * Sleep after some number of writes to avoid disk saturation.
		 * Don't cache the max writes value; an application shutting
		 * down might reset the value in order to do a fast flush or
		 * checkpoint.
		 */
		if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
		    !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
		    mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
			wrote_cnt = 0;
			__os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep);
		}
	}

done:	/*
	 * If a write is required, we have to force the pages to disk.  We
	 * don't do this as we go along because we want to give the OS as
	 * much time as possible to lazily flush, and because we have to flush
	 * files that might not even have had dirty buffers in the cache, so
	 * we have to walk the files list.
	 */
	if (ret == 0 && required_write) {
		if (dbmfp == NULL)
			ret = __memp_sync_files(env);
		else
			ret = __os_fsync(env, dbmfp->fhp);
	}

	/* If we've opened files to flush pages, close them. */
	if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0)
		ret = t_ret;

err:	__os_free(env, bharray);
	if (wrote_totalp != NULL)
		*wrote_totalp = wrote_total;

	return (ret);
}

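/*
 * __memp_sync_file --
 *	Flush a single underlying file to disk; called as the
 *	__memp_walk_files() callback from __memp_sync_files().
 */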
static int
__memp_sync_file(env, mfp, argp, countp, flags)
	ENV *env;
	MPOOLFILE *mfp;
	void *argp;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	int ret, t_ret;

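	/* This callback doesn't use the countp or flags arguments. */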
	COMPQUIET(countp, NULL);
	COMPQUIET(flags, 0);

	if (!mfp->file_written || mfp->no_backing_file ||
	    mfp->deadfile || F_ISSET(mfp, MP_TEMP))
		return (0);
	/*
	 * Pin the MPOOLFILE structure into memory, and release the
	 * region mutex allowing us to walk the linked list.  We'll
	 * re-acquire that mutex to move to the next entry in the list.
	 *
	 * This works because we only need to flush current entries,
	 * we don't care about new entries being added, and the linked
	 * list is never re-ordered, a single pass is sufficient.  It
	 * requires MPOOLFILE structures removed before we get to them
	 * be flushed to disk, but that's nothing new, they could have
	 * been removed while checkpoint was running, too.
	 *
	 * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
	 * not being discarded.  (A thread removing the MPOOLFILE
	 * will: hold the MPOOLFILE mutex, set deadfile, drop the
	 * MPOOLFILE mutex and then acquire the region MUTEX to walk
	 * the linked list and remove the MPOOLFILE structure.)  Make
	 * sure the MPOOLFILE wasn't marked dead while we waited for
	 * the mutex.
	 */
	MUTEX_LOCK(env, mfp->mutex);
	if (!mfp->file_written || mfp->deadfile) {
		MUTEX_UNLOCK(env, mfp->mutex);
		return (0);
	}
	++mfp->mpf_cnt;
	MUTEX_UNLOCK(env, mfp->mutex);

	/*
	 * Look for an already open, writeable handle (fsync doesn't
	 * work on read-only Windows handles).
	 */
	dbmp = env->mp_handle;
	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
		if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
			continue;
		/*
		 * We don't want to hold the mutex while calling sync.
		 * Increment the DB_MPOOLFILE handle ref count to pin
		 * it into memory.
		 */
		++dbmfp->ref;
		break;
	}
	MUTEX_UNLOCK(env, dbmp->mutex);

	/* If we don't find a handle we can use, open one. */
	if (dbmfp == NULL) {
		if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
			__db_err(env, ret,
			    "%s: unable to flush", (char *)
			    R_ADDR(dbmp->reginfo, mfp->path_off));
		}
	} else
		ret = __os_fsync(env, dbmfp->fhp);

	/*
	 * Re-acquire the MPOOLFILE mutex, we need it to modify the
	 * reference count.
	 */
	MUTEX_LOCK(env, mfp->mutex);

	/*
	 * If we wrote the file and there are no other references (or there
	 * is a single reference, and it's the one we opened to write
	 * buffers during checkpoint), clear the file_written flag.  We
	 * do this so that applications opening thousands of files don't
	 * loop here opening and flushing those files during checkpoint.
	 *
	 * The danger here is if a buffer were to be written as part of
	 * a checkpoint, and then not be flushed to disk.  This cannot
	 * happen because we only clear file_written when there are no
	 * other users of the MPOOLFILE in the system, and, as we hold
	 * the region lock, there is no possibility of another thread of
	 * control racing with us to open a MPOOLFILE.
	 */
	if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
	    dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
		mfp->file_written = 0;

		/*
		 * We may be the last reference for a MPOOLFILE, as we
		 * weren't holding the MPOOLFILE mutex when flushing
		 * its buffers to disk.  If we can discard it, set
		 * a flag to schedule a clean-out pass.  (Not likely,
		 * I mean, what are the chances that there aren't any
		 * buffers in the pool?  Regardless, it might happen.)
		 */
		if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
			*(int *)argp = 1;
	}

	/*
	 * If we found the file we must close it in case we are the last
	 * reference to the dbmfp.  NOTE: since we have incremented
	 * mfp->mpf_cnt this cannot be the last reference to the mfp.
	 * This is important since we are called with the hash bucket
	 * locked.  The mfp will get freed via the cleanup pass.
	 */
	if (dbmfp != NULL &&
	    (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
		ret = t_ret;

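	/* Release the reference we took on the MPOOLFILE above. */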
	--mfp->mpf_cnt;

	/* Unlock the MPOOLFILE. */
	MUTEX_UNLOCK(env, mfp->mutex);
	return (ret);
}

/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 */
static int
__memp_sync_files(env)
	ENV *env;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *mp;
	MPOOLFILE *mfp, *next_mfp;
	int i, need_discard_pass, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	need_discard_pass = ret = 0;

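	/*
	 * Walk the file table; DB_STAT_MEMP_NOERROR means the walk continues
	 * past a per-file error instead of stopping at the first failure.
	 */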
	ret = __memp_walk_files(env,
	    mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);

	/*
	 * We may need to do a last pass through the MPOOLFILE list -- if we
	 * were the last reference to an MPOOLFILE, we need to clean it out.
	 */
	if (!need_discard_pass)
		return (ret);

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
retry:		MUTEX_LOCK(env, hp->mtx_hash);
		for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
		    __mpoolfile); mfp != NULL; mfp = next_mfp) {
			next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
			/*
			 * Do a fast check -- we can check for zero/non-zero
			 * without a mutex on the MPOOLFILE.  If likely to
			 * succeed, lock the MPOOLFILE down and look for real.
			 */
			if (mfp->deadfile ||
			    mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
				continue;

			MUTEX_LOCK(env, mfp->mutex);
			if (!mfp->deadfile &&
			    mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				(void)__memp_mf_discard(dbmp, mfp);
				goto retry;
			} else
				MUTEX_UNLOCK(env, mfp->mutex);
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	return (ret);
}

/*
 * __memp_mf_sync --
 *	Flush an MPOOLFILE, when no currently open handle is available.
 *
 * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
 */
int
__memp_mf_sync(dbmp, mfp, locked)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	int locked;
{
	DB_FH *fhp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *mp;
	int ret, t_ret;
	char *rpath;

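	/* Quiet compiler warnings: hp is only set and used when !locked. */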
	COMPQUIET(hp, NULL);
	env = dbmp->env;

	/*
	 * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try and rename the file.
	 */
	if (!locked) {
		mp = dbmp->reginfo[0].primary;
		hp = R_ADDR(dbmp->reginfo, mp->ftab);
		hp += FNBUCKET(
		    R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
		MUTEX_LOCK(env, hp->mtx_hash);
	}

	if ((ret = __db_appname(env, DB_APP_DATA,
	    R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
		if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
			ret = __os_fsync(env, fhp);
			if ((t_ret =
			    __os_closehandle(env, fhp)) != 0 && ret == 0)
				ret = t_ret;
		}
		__os_free(env, rpath);
	}

	if (!locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	return (ret);
}

/*
 * __memp_close_flush_files --
 *	Close files opened only to flush buffers.
 */
static int
__memp_close_flush_files(env, dosync)
	ENV *env;
	int dosync;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	int ret;

	dbmp = env->mp_handle;

	/*
	 * The routine exists because we must close files opened by sync to
	 * flush buffers.  There are two cases: first, extent files have to
	 * be closed so they may be removed when empty.  Second, regular
	 * files have to be closed so we don't run out of descriptors (for
	 * example, an application partitioning its data into databases
	 * based on timestamps, so there's a continually increasing set of
	 * files).
	 *
	 * We mark files opened in the __memp_bhwrite() function with the
	 * MP_FLUSH flag.  Here we walk through our file descriptor list,
	 * and, if a file was opened by __memp_bhwrite(), we close it.
	 */
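	/*
	 * Each time we close a handle we drop the list mutex, so the list
	 * may have changed underneath us; restart the walk from the top.
	 */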
retry:	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
		if (F_ISSET(dbmfp, MP_FLUSH)) {
			F_CLR(dbmfp, MP_FLUSH);
			MUTEX_UNLOCK(env, dbmp->mutex);
			if (dosync) {
				/*
				 * If we have the only open handle on the file,
				 * clear the dirty flag so we don't re-open and
				 * sync it again when discarding the MPOOLFILE
				 * structure.  Clear the flag before the sync
				 * so we can't race with a thread writing the
				 * file.
				 */
				mfp = dbmfp->mfp;
				if (mfp->mpf_cnt == 1) {
					MUTEX_LOCK(env, mfp->mutex);
					if (mfp->mpf_cnt == 1)
						mfp->file_written = 0;
					MUTEX_UNLOCK(env, mfp->mutex);
				}
				if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
					return (ret);
			}
			if ((ret = __memp_fclose(dbmfp, 0)) != 0)
				return (ret);
			goto retry;
		}
	MUTEX_UNLOCK(env, dbmp->mutex);

	return (0);
}

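/*
 * __bhcmp --
 *	qsort comparison: order buffers by file (MPOOLFILE region offset),
 *	then by page number.
 */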
static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH_TRACK *bhp1, *bhp2;

	bhp1 = (BH_TRACK *)p1;
	bhp2 = (BH_TRACK *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->track_off < bhp2->track_off)
		return (-1);
	if (bhp1->track_off > bhp2->track_off)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->track_pgno < bhp2->track_pgno)
		return (-1);
	if (bhp1->track_pgno > bhp2->track_pgno)
		return (1);
	return (0);
}