/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle. All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	The President and Fellows of Harvard University. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
36 * 37 * $Id: txn_chkpt.c,v 12.53 2008/04/23 15:39:56 alanb Exp $ 38 */ 39 40#include "db_config.h" 41 42#include "db_int.h" 43#include "dbinc/log.h" 44#include "dbinc/mp.h" 45#include "dbinc/txn.h" 46 47/* 48 * __txn_checkpoint_pp -- 49 * ENV->txn_checkpoint pre/post processing. 50 * 51 * PUBLIC: int __txn_checkpoint_pp 52 * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t)); 53 */ 54int 55__txn_checkpoint_pp(dbenv, kbytes, minutes, flags) 56 DB_ENV *dbenv; 57 u_int32_t kbytes, minutes, flags; 58{ 59 DB_THREAD_INFO *ip; 60 ENV *env; 61 int ret; 62 63 env = dbenv->env; 64 65 ENV_REQUIRES_CONFIG(env, 66 env->tx_handle, "txn_checkpoint", DB_INIT_TXN); 67 68 /* 69 * On a replication client, all transactions are read-only; therefore, 70 * a checkpoint is a null-op. 71 * 72 * We permit txn_checkpoint, instead of just rendering it illegal, 73 * so that an application can just let a checkpoint thread continue 74 * to operate as it gets promoted or demoted between being a 75 * master and a client. 76 */ 77 if (IS_REP_CLIENT(env)) 78 return (0); 79 80 ENV_ENTER(env, ip); 81 REPLICATION_WRAP(env, 82 (__txn_checkpoint(env, kbytes, minutes, flags)), 0, ret); 83 ENV_LEAVE(env, ip); 84 return (ret); 85} 86 87/* 88 * __txn_checkpoint -- 89 * ENV->txn_checkpoint. 90 * 91 * PUBLIC: int __txn_checkpoint 92 * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t)); 93 */ 94int 95__txn_checkpoint(env, kbytes, minutes, flags) 96 ENV *env; 97 u_int32_t kbytes, minutes, flags; 98{ 99 DB_LSN ckp_lsn, last_ckp; 100 DB_TXNMGR *mgr; 101 DB_TXNREGION *region; 102 REGENV *renv; 103 REGINFO *infop; 104 time_t last_ckp_time, now; 105 u_int32_t bytes, id, logflags, mbytes, op; 106 int ret; 107 108 DB_ASSERT(env, !IS_REP_CLIENT(env)); 109 ret = 0; 110 111 mgr = env->tx_handle; 112 region = mgr->reginfo.primary; 113 infop = env->reginfo; 114 renv = infop->primary; 115 /* 116 * No mutex is needed as envid is read-only once it is set. 
117 */ 118 id = renv->envid; 119 120 /* 121 * The checkpoint LSN is an LSN such that all transactions begun before 122 * it are complete. Our first guess (corrected below based on the list 123 * of active transactions) is the last-written LSN. 124 */ 125 if ((ret = __log_current_lsn(env, &ckp_lsn, &mbytes, &bytes)) != 0) 126 return (ret); 127 128 if (!LF_ISSET(DB_FORCE)) { 129 /* Don't checkpoint a quiescent database. */ 130 if (bytes == 0 && mbytes == 0) 131 return (0); 132 133 /* 134 * If either kbytes or minutes is non-zero, then only take the 135 * checkpoint if more than "minutes" minutes have passed or if 136 * more than "kbytes" of log data have been written since the 137 * last checkpoint. 138 */ 139 if (kbytes != 0 && 140 mbytes * 1024 + bytes / 1024 >= (u_int32_t)kbytes) 141 goto do_ckp; 142 143 if (minutes != 0) { 144 (void)time(&now); 145 146 TXN_SYSTEM_LOCK(env); 147 last_ckp_time = region->time_ckp; 148 TXN_SYSTEM_UNLOCK(env); 149 150 if (now - last_ckp_time >= (time_t)(minutes * 60)) 151 goto do_ckp; 152 } 153 154 /* 155 * If we checked time and data and didn't go to checkpoint, 156 * we're done. 157 */ 158 if (minutes != 0 || kbytes != 0) 159 return (0); 160 } 161 162 /* 163 * We must single thread checkpoints otherwise the chk_lsn may get out 164 * of order. We need to capture the start of the earliest currently 165 * active transaction (chk_lsn) and then flush all buffers. While 166 * doing this we we could then be overtaken by another checkpoint that 167 * sees a later chk_lsn but competes first. An archive process could 168 * then remove a log this checkpoint depends on. 169 */ 170do_ckp: 171 MUTEX_LOCK(env, region->mtx_ckp); 172 if ((ret = __txn_getactive(env, &ckp_lsn)) != 0) 173 goto err; 174 175 /* 176 * Checkpoints in replication groups can cause performance problems. 177 * 178 * As on the master, checkpoint on the replica requires the cache be 179 * flushed. 
The problem occurs when a client has dirty cache pages 180 * to write when the checkpoint record arrives, and the client's PERM 181 * response is necessary in order to meet the system's durability 182 * guarantees. In this case, the master will have to wait until the 183 * client completes its cache flush and writes the checkpoint record 184 * before subsequent transactions can be committed. The delay may 185 * cause transactions to timeout waiting on client response, which 186 * can cause nasty ripple effects in the system's overall throughput. 187 * [#15338] 188 * 189 * First, we send a start-sync record when the checkpoint starts so 190 * clients can start flushing their cache in preparation for the 191 * arrival of the checkpoint record. 192 */ 193 if (LOGGING_ON(env) && 194 IS_REP_MASTER(env) && env->rep_handle->send != NULL) 195 (void)__rep_send_message(env, 196 DB_EID_BROADCAST, REP_START_SYNC, &ckp_lsn, NULL, 0, 0); 197 198 /* Flush the cache. */ 199 if (MPOOL_ON(env) && 200 (ret = __memp_sync_int( 201 env, NULL, 0, DB_SYNC_CHECKPOINT, NULL, NULL)) != 0) { 202 __db_err(env, ret, 203 "txn_checkpoint: failed to flush the buffer cache"); 204 goto err; 205 } 206 207 /* 208 * The client won't have more dirty pages to flush from its cache than 209 * the master did, but there may be differences between the hardware, 210 * I/O configuration and workload on the master and the client that 211 * can result in the client being unable to finish its cache flush as 212 * fast as the master. A way to avoid the problem is to pause after 213 * the master completes its checkpoint and before the actual checkpoint 214 * record is logged, giving the replicas additional time to finish. 215 * 216 * !!! 217 * Currently turned off when testing, because it makes the test suite 218 * take a long time to run. 
219 */ 220#ifndef CONFIG_TEST 221 if (LOGGING_ON(env) && 222 IS_REP_MASTER(env) && env->rep_handle->send != NULL && 223 !LF_ISSET(DB_CKP_INTERNAL) && 224 env->rep_handle->region->chkpt_delay != 0) 225 __os_yield(env, 0, env->rep_handle->region->chkpt_delay); 226#endif 227 228 /* 229 * Because we can't be a replication client here, and because 230 * recovery (somewhat unusually) calls txn_checkpoint and expects 231 * it to write a log message, LOGGING_ON is the correct macro here. 232 */ 233 if (LOGGING_ON(env)) { 234 TXN_SYSTEM_LOCK(env); 235 last_ckp = region->last_ckp; 236 TXN_SYSTEM_UNLOCK(env); 237 /* 238 * Put out records for the open files before we log 239 * the checkpoint. The records are certain to be at 240 * or after ckp_lsn, but before the checkpoint record 241 * itself, so they're sure to be included if we start 242 * recovery from the ckp_lsn contained in this 243 * checkpoint. 244 */ 245 logflags = DB_LOG_CHKPNT; 246 /* 247 * If this is a normal checkpoint, log files as checkpoints. 248 * If we are recovering, only log as DBREG_RCLOSE if 249 * there are no prepared txns. Otherwise, it should 250 * stay as DBREG_CHKPNT. 251 */ 252 op = DBREG_CHKPNT; 253 if (!IS_RECOVERING(env)) 254 logflags |= DB_FLUSH; 255 else if (region->stat.st_nrestores == 0) 256 op = DBREG_RCLOSE; 257 if ((ret = __dbreg_log_files(env, op)) != 0 || 258 (ret = __txn_ckp_log(env, NULL, &ckp_lsn, logflags, 259 &ckp_lsn, &last_ckp, (int32_t)time(NULL), id, 0)) != 0) { 260 __db_err(env, ret, 261 "txn_checkpoint: log failed at LSN [%ld %ld]", 262 (long)ckp_lsn.file, (long)ckp_lsn.offset); 263 goto err; 264 } 265 266 if ((ret = __txn_updateckp(env, &ckp_lsn)) != 0) 267 goto err; 268 } 269 270err: MUTEX_UNLOCK(env, region->mtx_ckp); 271 return (ret); 272} 273 274/* 275 * __txn_getactive -- 276 * Find the oldest active transaction and figure out its "begin" LSN. 
277 * This is the lowest LSN we can checkpoint, since any record written 278 * after it may be involved in a transaction and may therefore need 279 * to be undone in the case of an abort. 280 * 281 * We check both the file and offset for 0 since the lsn may be in 282 * transition. If it is then we don't care about this txn because it 283 * must be starting after we set the initial value of lsnp in the caller. 284 * All txns must initalize their begin_lsn before writing to the log. 285 * 286 * PUBLIC: int __txn_getactive __P((ENV *, DB_LSN *)); 287 */ 288int 289__txn_getactive(env, lsnp) 290 ENV *env; 291 DB_LSN *lsnp; 292{ 293 DB_TXNMGR *mgr; 294 DB_TXNREGION *region; 295 TXN_DETAIL *td; 296 297 mgr = env->tx_handle; 298 region = mgr->reginfo.primary; 299 300 TXN_SYSTEM_LOCK(env); 301 SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) 302 if (td->begin_lsn.file != 0 && 303 td->begin_lsn.offset != 0 && 304 LOG_COMPARE(&td->begin_lsn, lsnp) < 0) 305 *lsnp = td->begin_lsn; 306 TXN_SYSTEM_UNLOCK(env); 307 308 return (0); 309} 310 311/* 312 * __txn_getckp -- 313 * Get the LSN of the last transaction checkpoint. 314 * 315 * PUBLIC: int __txn_getckp __P((ENV *, DB_LSN *)); 316 */ 317int 318__txn_getckp(env, lsnp) 319 ENV *env; 320 DB_LSN *lsnp; 321{ 322 DB_LSN lsn; 323 DB_TXNMGR *mgr; 324 DB_TXNREGION *region; 325 326 mgr = env->tx_handle; 327 region = mgr->reginfo.primary; 328 329 TXN_SYSTEM_LOCK(env); 330 lsn = region->last_ckp; 331 TXN_SYSTEM_UNLOCK(env); 332 333 if (IS_ZERO_LSN(lsn)) 334 return (DB_NOTFOUND); 335 336 *lsnp = lsn; 337 return (0); 338} 339 340/* 341 * __txn_updateckp -- 342 * Update the last_ckp field in the transaction region. This happens 343 * at the end of a normal checkpoint and also when a replication client 344 * receives a checkpoint record. 
345 * 346 * PUBLIC: int __txn_updateckp __P((ENV *, DB_LSN *)); 347 */ 348int 349__txn_updateckp(env, lsnp) 350 ENV *env; 351 DB_LSN *lsnp; 352{ 353 DB_TXNMGR *mgr; 354 DB_TXNREGION *region; 355 356 mgr = env->tx_handle; 357 region = mgr->reginfo.primary; 358 359 /* 360 * We want to make sure last_ckp only moves forward; since we drop 361 * locks above and in log_put, it's possible for two calls to 362 * __txn_ckp_log to finish in a different order from how they were 363 * called. 364 */ 365 TXN_SYSTEM_LOCK(env); 366 if (LOG_COMPARE(®ion->last_ckp, lsnp) < 0) { 367 region->last_ckp = *lsnp; 368 (void)time(®ion->time_ckp); 369 } 370 TXN_SYSTEM_UNLOCK(env); 371 372 return (0); 373} 374