1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996-2009 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1995, 1996 8 * The President and Fellows of Harvard University. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Margo Seltzer. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * $Id$ 38 */ 39 40#include "db_config.h" 41 42#include "db_int.h" 43#include "dbinc/log.h" 44#include "dbinc/mp.h" 45#include "dbinc/txn.h" 46 47/* 48 * __txn_checkpoint_pp -- 49 * ENV->txn_checkpoint pre/post processing. 50 * 51 * PUBLIC: int __txn_checkpoint_pp 52 * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); 53 */ 54int 55__txn_checkpoint_pp(dbenv, kbytes, minutes, flags) 56 DB_ENV *dbenv; 57 u_int32_t kbytes, minutes, flags; 58{ 59 DB_THREAD_INFO *ip; 60 ENV *env; 61 int ret; 62 63 env = dbenv->env; 64 65 ENV_REQUIRES_CONFIG(env, 66 env->tx_handle, "txn_checkpoint", DB_INIT_TXN); 67 68 /* 69 * On a replication client, all transactions are read-only; therefore, 70 * a checkpoint is a null-op. 71 * 72 * We permit txn_checkpoint, instead of just rendering it illegal, 73 * so that an application can just let a checkpoint thread continue 74 * to operate as it gets promoted or demoted between being a 75 * master and a client. 76 */ 77 if (IS_REP_CLIENT(env)) 78 return (0); 79 80 ENV_ENTER(env, ip); 81 REPLICATION_WRAP(env, 82 (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret); 83 ENV_LEAVE(env, ip); 84 return (ret); 85} 86 87/* 88 * __txn_checkpoint -- 89 * ENV->txn_checkpoint. 90 * 91 * PUBLIC: int __txn_checkpoint 92 * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t)); 93 */ 94int 95__txn_checkpoint(env, kbytes, minutes, flags) 96 ENV *env; 97 u_int32_t kbytes, minutes, flags; 98{ 99 DB_LSN ckp_lsn, last_ckp; 100 DB_TXNMGR *mgr; 101 DB_TXNREGION *region; 102 REGENV *renv; 103 REGINFO *infop; 104 time_t last_ckp_time, now; 105 u_int32_t bytes, id, logflags, mbytes, op; 106 int ret; 107 108 ret = 0; 109 110 /* 111 * A client will only call through here during recovery, 112 * so just sync the Mpool and go home. We want to be sure 113 * that since queue meta pages are not rolled back that they 114 * are clean in the cache prior to any transaction log 115 * truncation due to syncup. 116 */ 117 if (IS_REP_CLIENT(env)) { 118 if (MPOOL_ON(env) && 119 (ret = __memp_sync(env, DB_SYNC_CHECKPOINT, NULL)) != 0) { 120 __db_err(env, ret, 121 "txn_checkpoint: failed to flush the buffer cache"); 122 return (ret); 123 } 124 return (0); 125 } 126 127 mgr = env->tx_handle; 128 region = mgr->reginfo.primary; 129 infop = env->reginfo; 130 renv = infop->primary; 131 /* 132 * No mutex is needed as envid is read-only once it is set. 133 */ 134 id = renv->envid; 135 136 /* 137 * The checkpoint LSN is an LSN such that all transactions begun before 138 * it are complete. Our first guess (corrected below based on the list 139 * of active transactions) is the last-written LSN. 140 */ 141 if ((ret = __log_current_lsn(env, &ckp_lsn, &mbytes, &bytes)) != 0) 142 return (ret); 143 144 if (!LF_ISSET(DB_FORCE)) { 145 /* Don't checkpoint a quiescent database. */ 146 if (bytes == 0 && mbytes == 0) 147 return (0); 148 149 /* 150 * If either kbytes or minutes is non-zero, then only take the 151 * checkpoint if more than "minutes" minutes have passed or if 152 * more than "kbytes" of log data have been written since the 153 * last checkpoint. 154 */ 155 if (kbytes != 0 && 156 mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes) 157 goto do_ckp; 158 159 if (minutes != 0) { 160 (void)time(&now); 161 162 TXN_SYSTEM_LOCK(env); 163 last_ckp_time = region->time_ckp; 164 TXN_SYSTEM_UNLOCK(env); 165 166 if (now - last_ckp_time >= (time_t)(minutes * 60)) 167 goto do_ckp; 168 } 169 170 /* 171 * If we checked time and data and didn't go to checkpoint, 172 * we're done. 173 */ 174 if (minutes != 0 || kbytes != 0) 175 return (0); 176 } 177 178 /* 179 * We must single thread checkpoints otherwise the chk_lsn may get out 180 * of order. We need to capture the start of the earliest currently 181 * active transaction (chk_lsn) and then flush all buffers. While 182 * doing this we we could then be overtaken by another checkpoint that 183 * sees a later chk_lsn but competes first. An archive process could 184 * then remove a log this checkpoint depends on. 185 */ 186do_ckp: 187 MUTEX_LOCK(env, region->mtx_ckp); 188 if ((ret = __txn_getactive(env, &ckp_lsn)) != 0) 189 goto err; 190 191 /* 192 * Checkpoints in replication groups can cause performance problems. 193 * 194 * As on the master, checkpoint on the replica requires the cache be 195 * flushed. The problem occurs when a client has dirty cache pages 196 * to write when the checkpoint record arrives, and the client's PERM 197 * response is necessary in order to meet the system's durability 198 * guarantees. In this case, the master will have to wait until the 199 * client completes its cache flush and writes the checkpoint record 200 * before subsequent transactions can be committed. The delay may 201 * cause transactions to timeout waiting on client response, which 202 * can cause nasty ripple effects in the system's overall throughput. 203 * [#15338] 204 * 205 * First, we send a start-sync record when the checkpoint starts so 206 * clients can start flushing their cache in preparation for the 207 * arrival of the checkpoint record. 208 */ 209 if (LOGGING_ON(env) && IS_REP_MASTER(env)) { 210#ifdef HAVE_REPLICATION_THREADS 211 /* 212 * If repmgr is configured in the shared environment (which we 213 * know if we have a local host address), but no send() function 214 * configured for this process, assume we have a 215 * replication-unaware process that wants to automatically 216 * participate in replication (i.e., sending replication 217 * messages to clients). 218 */ 219 if (env->rep_handle->send == NULL && 220 F_ISSET(env, ENV_THREAD) && 221 env->rep_handle->region->my_addr.host != INVALID_ROFF && 222 (ret = __repmgr_autostart(env)) != 0) 223 goto err; 224#endif 225 if (env->rep_handle->send != NULL) 226 (void)__rep_send_message(env, DB_EID_BROADCAST, 227 REP_START_SYNC, &ckp_lsn, NULL, 0, 0); 228 } 229 230 /* Flush the cache. */ 231 if (MPOOL_ON(env) && 232 (ret = __memp_sync_int( 233 env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) { 234 __db_err(env, ret, 235 "txn_checkpoint: failed to flush the buffer cache"); 236 goto err; 237 } 238 239 /* 240 * The client won't have more dirty pages to flush from its cache than 241 * the master did, but there may be differences between the hardware, 242 * I/O configuration and workload on the master and the client that 243 * can result in the client being unable to finish its cache flush as 244 * fast as the master. A way to avoid the problem is to pause after 245 * the master completes its checkpoint and before the actual checkpoint 246 * record is logged, giving the replicas additional time to finish. 247 * 248 * !!! 249 * Currently turned off when testing, because it makes the test suite 250 * take a long time to run. 251 */ 252#ifndef CONFIG_TEST 253 if (LOGGING_ON(env) && 254 IS_REP_MASTER(env) && env->rep_handle->send != NULL && 255 !LF_ISSET(DB_CKP_INTERNAL) && 256 env->rep_handle->region->chkpt_delay != 0) 257 __os_yield(env, 0, env->rep_handle->region->chkpt_delay); 258#endif 259 260 /* 261 * Because we can't be a replication client here, and because 262 * recovery (somewhat unusually) calls txn_checkpoint and expects 263 * it to write a log message, LOGGING_ON is the correct macro here. 264 */ 265 if (LOGGING_ON(env)) { 266 TXN_SYSTEM_LOCK(env); 267 last_ckp = region->last_ckp; 268 TXN_SYSTEM_UNLOCK(env); 269 /* 270 * Put out records for the open files before we log 271 * the checkpoint. The records are certain to be at 272 * or after ckp_lsn, but before the checkpoint record 273 * itself, so they're sure to be included if we start 274 * recovery from the ckp_lsn contained in this 275 * checkpoint. 276 */ 277 logflags = DB_LOG_CHKPNT; 278 /* 279 * If this is a normal checkpoint, log files as checkpoints. 280 * If we are recovering, only log as DBREG_RCLOSE if 281 * there are no prepared txns. Otherwise, it should 282 * stay as DBREG_CHKPNT. 283 */ 284 op = DBREG_CHKPNT; 285 if (!IS_RECOVERING(env)) 286 logflags |= DB_FLUSH; 287 else if (region->stat.st_nrestores == 0) 288 op = DBREG_RCLOSE; 289 if ((ret = __dbreg_log_files(env, op)) != 0 || 290 (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags, 291 &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) { 292 __db_err(env, ret, 293 "txn_checkpoint: log failed at LSN [%ld %ld]", 294 (long)ckp_lsn.file, (long)ckp_lsn.offset); 295 goto err; 296 } 297 298 if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0) 299 goto err; 300 } 301 302err: MUTEX_UNLOCK(env, region->mtx_ckp); 303 return (ret); 304} 305 306/* 307 * __txn_getactive -- 308 * Find the oldest active transaction and figure out its "begin" LSN. 309 * This is the lowest LSN we can checkpoint, since any record written 310 * after it may be involved in a transaction and may therefore need 311 * to be undone in the case of an abort. 312 * 313 * We check both the file and offset for 0 since the lsn may be in 314 * transition. If it is then we don't care about this txn because it 315 * must be starting after we set the initial value of lsnp in the caller. 316 * All txns must initalize their begin_lsn before writing to the log. 317 * 318 * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *)); 319 */ 320int 321__txn_getactive(env, lsnp) 322 ENV *env; 323 DB_LSN *lsnp; 324{ 325 DB_TXNMGR *mgr; 326 DB_TXNREGION *region; 327 TXN_DETAIL *td; 328 329 mgr = env->tx_handle; 330 region = mgr->reginfo.primary; 331 332 TXN_SYSTEM_LOCK(env); 333 SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) 334 if (td->begin_lsn.file != 0 && 335 td->begin_lsn.offset != 0 && 336 LOG_COMPARE(&td->begin_lsn, lsnp) < 0) 337 *lsnp = td->begin_lsn; 338 TXN_SYSTEM_UNLOCK(env); 339 340 return (0); 341} 342 343/* 344 * __txn_getckp -- 345 * Get the LSN of the last transaction checkpoint. 346 * 347 * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *)); 348 */ 349int 350__txn_getckp(env, lsnp) 351 ENV *env; 352 DB_LSN *lsnp; 353{ 354 DB_LSN lsn; 355 DB_TXNMGR *mgr; 356 DB_TXNREGION *region; 357 358 mgr = env->tx_handle; 359 region = mgr->reginfo.primary; 360 361 TXN_SYSTEM_LOCK(env); 362 lsn = region->last_ckp; 363 TXN_SYSTEM_UNLOCK(env); 364 365 if (IS_ZERO_LSN(lsn)) 366 return (DB_NOTFOUND); 367 368 *lsnp = lsn; 369 return (0); 370} 371 372/* 373 * __txn_updateckp -- 374 * Update the last_ckp field in the transaction region. This happens 375 * at the end of a normal checkpoint and also when a replication client 376 * receives a checkpoint record. 377 * 378 * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *)); 379 */ 380int 381__txn_updateckp(env, lsnp) 382 ENV *env; 383 DB_LSN *lsnp; 384{ 385 DB_TXNMGR *mgr; 386 DB_TXNREGION *region; 387 388 mgr = env->tx_handle; 389 region = mgr->reginfo.primary; 390 391 /* 392 * We want to make sure last_ckp only moves forward; since we drop 393 * locks above and in log_put, it's possible for two calls to 394 * __txn_ckp_log to finish in a different order from how they were 395 * called. 396 */ 397 TXN_SYSTEM_LOCK(env); 398 if (LOG_COMPARE(®ion->last_ckp, lsnp) < 0) { 399 region->last_ckp = *lsnp; 400 (void)time(®ion->time_ckp); 401 } 402 TXN_SYSTEM_UNLOCK(env); 403 404 return (0); 405} 406