1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1995, 1996
8 *	The President and Fellows of Harvard University.  All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * Margo Seltzer.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * $Id$
38 */
39
40#include "db_config.h"
41
42#include "db_int.h"
43#include "dbinc/log.h"
44#include "dbinc/mp.h"
45#include "dbinc/txn.h"
46
47/*
48 * __txn_checkpoint_pp --
49 *	ENV->txn_checkpoint pre/post processing.
50 *
51 * PUBLIC: int __txn_checkpoint_pp
52 * PUBLIC:     __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
53 */
54int
55__txn_checkpoint_pp(dbenv, kbytes, minutes, flags)
56	DB_ENV *dbenv;
57	u_int32_t kbytes, minutes, flags;
58{
59	DB_THREAD_INFO *ip;
60	ENV *env;
61	int ret;
62
63	env = dbenv->env;
64
65	ENV_REQUIRES_CONFIG(env,
66	    env->tx_handle, "txn_checkpoint", DB_INIT_TXN);
67
68	/*
69	 * On a replication client, all transactions are read-only; therefore,
70	 * a checkpoint is a null-op.
71	 *
72	 * We permit txn_checkpoint, instead of just rendering it illegal,
73	 * so that an application can just let a checkpoint thread continue
74	 * to operate as it gets promoted or demoted between being a
75	 * master and a client.
76	 */
77	if (IS_REP_CLIENT(env))
78		return (0);
79
80	ENV_ENTER(env, ip);
81	REPLICATION_WRAP(env,
82	    (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret);
83	ENV_LEAVE(env, ip);
84	return (ret);
85}
86
87/*
88 * __txn_checkpoint --
89 *	ENV->txn_checkpoint.
90 *
91 * PUBLIC: int __txn_checkpoint
92 * PUBLIC:	__P((ENV *, u_int32_t, u_int32_t, u_int32_t));
93 */
94int
95__txn_checkpoint(env, kbytes, minutes, flags)
96	ENV *env;
97	u_int32_t kbytes, minutes, flags;
98{
99	DB_LSN ckp_lsn, last_ckp;
100	DB_TXNMGR *mgr;
101	DB_TXNREGION *region;
102	REGENV *renv;
103	REGINFO *infop;
104	time_t last_ckp_time, now;
105	u_int32_t bytes, id, logflags, mbytes, op;
106	int ret;
107
108	ret = 0;
109
110	/*
111	 * A client will only call through here during recovery,
112	 * so just sync the Mpool and go home.  We want to be sure
113	 * that since queue meta pages are not rolled back that they
114	 * are clean in the cache prior to any transaction log
115	 * truncation due to syncup.
116	 */
117	if (IS_REP_CLIENT(env)) {
118		if (MPOOL_ON(env) &&
119		    (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) {
120			__db_err(env, ret,
121		    "txn_checkpoint: failed to flush the buffer cache");
122			return (ret);
123		}
124		return (0);
125	}
126
127	mgr = env->tx_handle;
128	region = mgr->reginfo.primary;
129	infop = env->reginfo;
130	renv = infop->primary;
131	/*
132	 * No mutex is needed as envid is read-only once it is set.
133	 */
134	id = renv->envid;
135
136	/*
137	 * The checkpoint LSN is an LSN such that all transactions begun before
138	 * it are complete.  Our first guess (corrected below based on the list
139	 * of active transactions) is the last-written LSN.
140	 */
141	if ((ret = __log_current_lsn(env, &ckp_lsn, &mbytes, &bytes)) != 0)
142		return (ret);
143
144	if (!LF_ISSET(DB_FORCE)) {
145		/* Don't checkpoint a quiescent database. */
146		if (bytes == 0 && mbytes == 0)
147			return (0);
148
149		/*
150		 * If either kbytes or minutes is non-zero, then only take the
151		 * checkpoint if more than "minutes" minutes have passed or if
152		 * more than "kbytes" of log data have been written since the
153		 * last checkpoint.
154		 */
155		if (kbytes != 0 &&
156		    mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes)
157			goto do_ckp;
158
159		if (minutes != 0) {
160			(void)time(&now);
161
162			TXN_SYSTEM_LOCK(env);
163			last_ckp_time = region->time_ckp;
164			TXN_SYSTEM_UNLOCK(env);
165
166			if (now - last_ckp_time >= (time_t)(minutes * 60))
167				goto do_ckp;
168		}
169
170		/*
171		 * If we checked time and data and didn't go to checkpoint,
172		 * we're done.
173		 */
174		if (minutes != 0 || kbytes != 0)
175			return (0);
176	}
177
178	/*
179	 * We must single thread checkpoints otherwise the chk_lsn may get out
180	 * of order.  We need to capture the start of the earliest currently
181	 * active transaction (chk_lsn) and then flush all buffers.  While
182	 * doing this we we could then be overtaken by another checkpoint that
183	 * sees a later chk_lsn but competes first.  An archive process could
184	 * then remove a log this checkpoint depends on.
185	 */
186do_ckp:
187	MUTEX_LOCK(env, region->mtx_ckp);
188	if ((ret = __txn_getactive(env, &ckp_lsn)) != 0)
189		goto err;
190
191	/*
192	 * Checkpoints in replication groups can cause performance problems.
193	 *
194	 * As on the master, checkpoint on the replica requires the cache be
195	 * flushed.  The problem occurs when a client has dirty cache pages
196	 * to write when the checkpoint record arrives, and the client's PERM
197	 * response is necessary in order to meet the system's durability
198	 * guarantees.  In this case, the master will have to wait until the
199	 * client completes its cache flush and writes the checkpoint record
200	 * before subsequent transactions can be committed.  The delay may
201	 * cause transactions to timeout waiting on client response, which
202	 * can cause nasty ripple effects in the system's overall throughput.
203	 * [#15338]
204	 *
205	 * First, we send a start-sync record when the checkpoint starts so
206	 * clients can start flushing their cache in preparation for the
207	 * arrival of the checkpoint record.
208	 */
209	if (LOGGING_ON(env) && IS_REP_MASTER(env)) {
210#ifdef HAVE_REPLICATION_THREADS
211		/*
212		 * If repmgr is configured in the shared environment (which we
213		 * know if we have a local host address), but no send() function
214		 * configured for this process, assume we have a
215		 * replication-unaware process that wants to automatically
216		 * participate in replication (i.e., sending replication
217		 * messages to clients).
218		 */
219		if (env->rep_handle->send == NULL &&
220		    F_ISSET(env, ENV_THREAD) &&
221		    env->rep_handle->region->my_addr.host != INVALID_ROFF &&
222		    (ret = __repmgr_autostart(env)) != 0)
223			goto err;
224#endif
225		if (env->rep_handle->send != NULL)
226			(void)__rep_send_message(env, DB_EID_BROADCAST,
227			    REP_START_SYNC, &ckp_lsn, NULL, 0, 0);
228	}
229
230	/* Flush the cache. */
231	if (MPOOL_ON(env) &&
232	    (ret = __memp_sync_int(
233		env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) {
234		__db_err(env, ret,
235		    "txn_checkpoint: failed to flush the buffer cache");
236		goto err;
237	}
238
239	/*
240	 * The client won't have more dirty pages to flush from its cache than
241	 * the master did, but there may be differences between the hardware,
242	 * I/O configuration and workload on the master and the client that
243	 * can result in the client being unable to finish its cache flush as
244	 * fast as the master.  A way to avoid the problem is to pause after
245	 * the master completes its checkpoint and before the actual checkpoint
246	 * record is logged, giving the replicas additional time to finish.
247	 *
248	 * !!!
249	 * Currently turned off when testing, because it makes the test suite
250	 * take a long time to run.
251	 */
252#ifndef	CONFIG_TEST
253	if (LOGGING_ON(env) &&
254	    IS_REP_MASTER(env) && env->rep_handle->send != NULL &&
255	    !LF_ISSET(DB_CKP_INTERNAL) &&
256	    env->rep_handle->region->chkpt_delay != 0)
257		__os_yield(env, 0, env->rep_handle->region->chkpt_delay);
258#endif
259
260	/*
261	 * Because we can't be a replication client here, and because
262	 * recovery (somewhat unusually) calls txn_checkpoint and expects
263	 * it to write a log message, LOGGING_ON is the correct macro here.
264	 */
265	if (LOGGING_ON(env)) {
266		TXN_SYSTEM_LOCK(env);
267		last_ckp = region->last_ckp;
268		TXN_SYSTEM_UNLOCK(env);
269		/*
270		 * Put out records for the open files before we log
271		 * the checkpoint.  The records are certain to be at
272		 * or after ckp_lsn, but before the checkpoint record
273		 * itself, so they're sure to be included if we start
274		 * recovery from the ckp_lsn contained in this
275		 * checkpoint.
276		 */
277		logflags = DB_LOG_CHKPNT;
278		/*
279		 * If this is a normal checkpoint, log files as checkpoints.
280		 * If we are recovering, only log as DBREG_RCLOSE if
281		 * there are no prepared txns.  Otherwise, it should
282		 * stay as DBREG_CHKPNT.
283		 */
284		op = DBREG_CHKPNT;
285		if (!IS_RECOVERING(env))
286			logflags |= DB_FLUSH;
287		else if (region->stat.st_nrestores == 0)
288			op = DBREG_RCLOSE;
289		if ((ret = __dbreg_log_files(env, op)) != 0 ||
290		    (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags,
291		    &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) {
292			__db_err(env, ret,
293			    "txn_checkpoint: log failed at LSN [%ld %ld]",
294			    (long)ckp_lsn.file, (long)ckp_lsn.offset);
295			goto err;
296		}
297
298		if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0)
299			goto err;
300	}
301
302err:	MUTEX_UNLOCK(env, region->mtx_ckp);
303	return (ret);
304}
305
306/*
307 * __txn_getactive --
308 *	 Find the oldest active transaction and figure out its "begin" LSN.
309 *	 This is the lowest LSN we can checkpoint, since any record written
310 *	 after it may be involved in a transaction and may therefore need
311 *	 to be undone in the case of an abort.
312 *
313 *	 We check both the file and offset for 0 since the lsn may be in
314 *	 transition.  If it is then we don't care about this txn because it
315 *	 must be starting after we set the initial value of lsnp in the caller.
316 *	 All txns must initalize their begin_lsn before writing to the log.
317 *
318 * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *));
319 */
320int
321__txn_getactive(env, lsnp)
322	ENV *env;
323	DB_LSN *lsnp;
324{
325	DB_TXNMGR *mgr;
326	DB_TXNREGION *region;
327	TXN_DETAIL *td;
328
329	mgr = env->tx_handle;
330	region = mgr->reginfo.primary;
331
332	TXN_SYSTEM_LOCK(env);
333	SH_TAILQ_FOREACH(td, &region->active_txn, links, __txn_detail)
334		if (td->begin_lsn.file != 0 &&
335		    td->begin_lsn.offset != 0 &&
336		    LOG_COMPARE(&td->begin_lsn, lsnp) < 0)
337			*lsnp = td->begin_lsn;
338	TXN_SYSTEM_UNLOCK(env);
339
340	return (0);
341}
342
343/*
344 * __txn_getckp --
345 *	Get the LSN of the last transaction checkpoint.
346 *
347 * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *));
348 */
349int
350__txn_getckp(env, lsnp)
351	ENV *env;
352	DB_LSN *lsnp;
353{
354	DB_LSN lsn;
355	DB_TXNMGR *mgr;
356	DB_TXNREGION *region;
357
358	mgr = env->tx_handle;
359	region = mgr->reginfo.primary;
360
361	TXN_SYSTEM_LOCK(env);
362	lsn = region->last_ckp;
363	TXN_SYSTEM_UNLOCK(env);
364
365	if (IS_ZERO_LSN(lsn))
366		return (DB_NOTFOUND);
367
368	*lsnp = lsn;
369	return (0);
370}
371
372/*
373 * __txn_updateckp --
374 *	Update the last_ckp field in the transaction region.  This happens
375 * at the end of a normal checkpoint and also when a replication client
376 * receives a checkpoint record.
377 *
378 * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *));
379 */
380int
381__txn_updateckp(env, lsnp)
382	ENV *env;
383	DB_LSN *lsnp;
384{
385	DB_TXNMGR *mgr;
386	DB_TXNREGION *region;
387
388	mgr = env->tx_handle;
389	region = mgr->reginfo.primary;
390
391	/*
392	 * We want to make sure last_ckp only moves forward;  since we drop
393	 * locks above and in log_put, it's possible for two calls to
394	 * __txn_ckp_log to finish in a different order from how they were
395	 * called.
396	 */
397	TXN_SYSTEM_LOCK(env);
398	if (LOG_COMPARE(&region->last_ckp, lsnp) < 0) {
399		region->last_ckp = *lsnp;
400		(void)time(&region->time_ckp);
401	}
402	TXN_SYSTEM_UNLOCK(env);
403
404	return (0);
405}
406