1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: log_put.c,v 12.70 2008/05/13 00:33:27 alexg Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/crypto.h" 13#include "dbinc/hmac.h" 14#include "dbinc/log.h" 15#include "dbinc/txn.h" 16 17static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t)); 18static int __log_file __P((ENV *, const DB_LSN *, char *, size_t)); 19static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t)); 20static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t)); 21static int __log_newfh __P((DB_LOG *, int)); 22static int __log_put_next __P((ENV *, 23 DB_LSN *, const DBT *, HDR *, DB_LSN *)); 24static int __log_putr __P((DB_LOG *, 25 DB_LSN *, const DBT *, u_int32_t, HDR *)); 26static int __log_write __P((DB_LOG *, void *, u_int32_t)); 27 28/* 29 * __log_put_pp -- 30 * ENV->log_put pre/post processing. 31 * 32 * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t)); 33 */ 34int 35__log_put_pp(dbenv, lsnp, udbt, flags) 36 DB_ENV *dbenv; 37 DB_LSN *lsnp; 38 const DBT *udbt; 39 u_int32_t flags; 40{ 41 DB_THREAD_INFO *ip; 42 ENV *env; 43 int ret; 44 45 env = dbenv->env; 46 47 ENV_REQUIRES_CONFIG(env, 48 env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG); 49 50 /* Validate arguments: check for allowed flags. */ 51 if ((ret = __db_fchk(env, "DB_ENV->log_put", flags, 52 DB_LOG_CHKPNT | DB_LOG_COMMIT | 53 DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0) 54 return (ret); 55 56 /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */ 57 if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH)) 58 return (__db_ferr(env, "DB_ENV->log_put", 1)); 59 60 /* Replication clients should never write log records. */ 61 if (IS_REP_CLIENT(env)) { 62 __db_errx(env, 63 "DB_ENV->log_put is illegal on replication clients"); 64 return (EINVAL); 65 } 66 67 ENV_ENTER(env, ip); 68 REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret); 69 ENV_LEAVE(env, ip); 70 return (ret); 71} 72 73/* 74 * __log_put -- 75 * ENV->log_put. 76 * 77 * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t)); 78 */ 79int 80__log_put(env, lsnp, udbt, flags) 81 ENV *env; 82 DB_LSN *lsnp; 83 const DBT *udbt; 84 u_int32_t flags; 85{ 86 DBT *dbt, t; 87 DB_CIPHER *db_cipher; 88 DB_LOG *dblp; 89 DB_LSN lsn, old_lsn; 90 DB_REP *db_rep; 91 HDR hdr; 92 LOG *lp; 93 REP *rep; 94 int lock_held, need_free, ret; 95 u_int8_t *key; 96 97 dblp = env->lg_handle; 98 lp = dblp->reginfo.primary; 99 db_cipher = env->crypto_handle; 100 db_rep = env->rep_handle; 101 if (db_rep != NULL) 102 rep = db_rep->region; 103 else 104 rep = NULL; 105 106 dbt = &t; 107 t = *udbt; 108 lock_held = need_free = 0; 109 ZERO_LSN(old_lsn); 110 hdr.len = hdr.prev = 0; 111 112#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) 113 /* 114 * If we are not a rep application, but are sharing a master rep env, 115 * we should not be writing log records. 116 */ 117 if (IS_REP_MASTER(env) && db_rep->send == NULL) { 118 __db_errx(env, "%s %s", 119 "Non-replication DB_ENV handle attempting", 120 "to modify a replicated environment"); 121 return (EINVAL); 122 } 123#endif 124 DB_ASSERT(env, !IS_REP_CLIENT(env)); 125 126 /* 127 * If we are coming from the logging code, we use an internal flag, 128 * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log 129 * record in place. Otherwise, if a user called log_put then we 130 * must copy it to new memory so that we know we can write it. 131 * 132 * We also must copy it to new memory if we are a replication master 133 * so that we retain an unencrypted copy of the log record to send 134 * to clients. 135 */ 136 if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) { 137 if (CRYPTO_ON(env)) 138 t.size += db_cipher->adj_size(udbt->size); 139 if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0) 140 goto err; 141 need_free = 1; 142 memcpy(t.data, udbt->data, udbt->size); 143 } 144 if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0) 145 goto err; 146 if (CRYPTO_ON(env)) 147 key = db_cipher->mac_key; 148 else 149 key = NULL; 150 151 /* Before we grab the region lock, calculate the record's checksum. */ 152 if (lp->persist.version != DB_LOGVERSION) 153 __db_chksum(NULL, dbt->data, dbt->size, key, hdr.chksum); 154 else 155 __db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum); 156 157 LOG_SYSTEM_LOCK(env); 158 lock_held = 1; 159 160 if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0) 161 goto panic_check; 162 163 /* 164 * Assign the return LSN before dropping the region lock. Necessary 165 * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in 166 * by the logging routines. We use atomic 32-bit operations because 167 * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC 168 * relies on reading the fields atomically. 169 */ 170 lsnp->file = lsn.file; 171 lsnp->offset = lsn.offset; 172 173#ifdef HAVE_REPLICATION 174 if (IS_REP_MASTER(env)) { 175 __rep_newfile_args nf_args; 176 DBT newfiledbt; 177 REP_BULK bulk; 178 size_t len; 179 u_int32_t ctlflags; 180 u_int8_t buf[__REP_NEWFILE_SIZE]; 181 182 /* 183 * Replication masters need to drop the lock to send messages, 184 * but want to drop and reacquire it a minimal number of times. 185 */ 186 ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ? 187 REPCTL_PERM : 0; 188 /* 189 * If using leases, keep track of our last PERM lsn. 190 * Set this on a master under the log lock. 191 */ 192 if (IS_USING_LEASES(env) && 193 FLD_ISSET(ctlflags, REPCTL_PERM)) 194 lp->max_perm_lsn = lsn; 195 LOG_SYSTEM_UNLOCK(env); 196 lock_held = 0; 197 if (LF_ISSET(DB_FLUSH)) 198 ctlflags |= REPCTL_FLUSH; 199 200 /* 201 * If we changed files and we're in a replicated environment, 202 * we need to inform our clients now that we've dropped the 203 * region lock. 204 * 205 * Note that a failed NEWFILE send is a dropped message that 206 * our client can handle, so we can ignore it. It's possible 207 * that the record we already put is a commit, so we don't just 208 * want to return failure. 209 */ 210 if (!IS_ZERO_LSN(old_lsn)) { 211 memset(&newfiledbt, 0, sizeof(newfiledbt)); 212 nf_args.version = lp->persist.version; 213 (void)__rep_newfile_marshal(env, &nf_args, 214 buf, __REP_NEWFILE_SIZE, &len); 215 DB_INIT_DBT(newfiledbt, buf, len); 216 (void)__rep_send_message(env, DB_EID_BROADCAST, 217 REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0); 218 } 219 220 /* 221 * If we're doing bulk processing put it in the bulk buffer. 222 */ 223 ret = 0; 224 if (FLD_ISSET(rep->config, REP_C_BULK)) { 225 /* 226 * Bulk could have been turned on by another process. 227 * If so, set the address into the bulk region now. 228 */ 229 if (db_rep->bulk == NULL) 230 db_rep->bulk = R_ADDR(&dblp->reginfo, 231 lp->bulk_buf); 232 memset(&bulk, 0, sizeof(bulk)); 233 bulk.addr = db_rep->bulk; 234 bulk.offp = &lp->bulk_off; 235 bulk.len = lp->bulk_len; 236 bulk.lsn = lsn; 237 bulk.type = REP_BULK_LOG; 238 bulk.eid = DB_EID_BROADCAST; 239 bulk.flagsp = &lp->bulk_flags; 240 ret = __rep_bulk_message(env, &bulk, NULL, 241 &lsn, udbt, ctlflags); 242 } 243 if (!FLD_ISSET(rep->config, REP_C_BULK) || 244 ret == DB_REP_BULKOVF) { 245 /* 246 * Then send the log record itself on to our clients. 247 */ 248 /* 249 * !!! 250 * In the crypto case, we MUST send the udbt, not the 251 * now-encrypted dbt. Clients have no way to decrypt 252 * without the header. 253 */ 254 ret = __rep_send_message(env, DB_EID_BROADCAST, 255 REP_LOG, &lsn, udbt, ctlflags, 0); 256 } 257 /* 258 * If the send fails and we're a commit or checkpoint, 259 * there's nothing we can do; the record's in the log. 260 * Flush it, even if we're running with TXN_NOSYNC, 261 * on the grounds that it should be in durable 262 * form somewhere. 263 * 264 * If the send fails with this perm record and leases 265 * are in use, we need to forcibly expire all lease 266 * grants to prevent authoritative reads. 267 */ 268 if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM)) { 269 LF_SET(DB_FLUSH); 270 if (IS_USING_LEASES(env)) 271 (void)__rep_lease_expire(env, 0); 272 } 273 /* 274 * We ignore send failures so reset 'ret' to 0 here. 275 * We needed to check special return values from 276 * bulk transfer and errors from either bulk or normal 277 * message sending need flushing on perm records. But 278 * otherwise we need to ignore it and reset it now. 279 */ 280 ret = 0; 281 } 282#endif 283 284 /* 285 * If needed, do a flush. Note that failures at this point 286 * are only permissible if we know we haven't written a commit 287 * record; __log_flush_commit is responsible for enforcing this. 288 * 289 * If a flush is not needed, see if WRITE_NOSYNC was set and we 290 * need to write out the log buffer. 291 */ 292 if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) { 293 if (!lock_held) { 294 LOG_SYSTEM_LOCK(env); 295 lock_held = 1; 296 } 297 if ((ret = __log_flush_commit(env, &lsn, flags)) != 0) 298 goto panic_check; 299 } 300 301 /* 302 * If flushed a checkpoint record, reset the "bytes since the last 303 * checkpoint" counters. 304 */ 305 if (LF_ISSET(DB_LOG_CHKPNT)) 306 lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0; 307 308 /* Increment count of records added to the log. */ 309 STAT(++lp->stat.st_record); 310 311 if (0) { 312panic_check: /* 313 * Writing log records cannot fail if we're a replication 314 * master. The reason is that once we send the record to 315 * replication clients, the transaction can no longer 316 * abort, otherwise the master would be out of sync with 317 * the rest of the replication group. Panic the system. 318 */ 319 if (ret != 0 && IS_REP_MASTER(env)) 320 ret = __env_panic(env, ret); 321 } 322 323err: if (lock_held) 324 LOG_SYSTEM_UNLOCK(env); 325 if (need_free) 326 __os_free(env, dbt->data); 327 328 /* 329 * If auto-remove is set and we switched files, remove unnecessary 330 * log files. 331 */ 332 if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove) 333 __log_autoremove(env); 334 335 return (ret); 336} 337 338/* 339 * __log_current_lsn -- 340 * Return the current LSN. 341 * 342 * PUBLIC: int __log_current_lsn 343 * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *)); 344 */ 345int 346__log_current_lsn(env, lsnp, mbytesp, bytesp) 347 ENV *env; 348 DB_LSN *lsnp; 349 u_int32_t *mbytesp, *bytesp; 350{ 351 DB_LOG *dblp; 352 LOG *lp; 353 354 dblp = env->lg_handle; 355 lp = dblp->reginfo.primary; 356 357 LOG_SYSTEM_LOCK(env); 358 359 /* 360 * We need the LSN of the last entry in the log. 361 * 362 * Typically, it's easy to get the last written LSN, you simply look 363 * at the current log pointer and back up the number of bytes of the 364 * last log record. However, if the last thing we did was write the 365 * log header of a new log file, then, this doesn't work, so we return 366 * the first log record that will be written in this new file. 367 */ 368 *lsnp = lp->lsn; 369 if (lp->lsn.offset > lp->len) 370 lsnp->offset -= lp->len; 371 372 /* 373 * Since we're holding the log region lock, return the bytes put into 374 * the log since the last checkpoint, transaction checkpoint needs it. 375 * 376 * We add the current buffer offset so as to count bytes that have not 377 * yet been written, but are sitting in the log buffer. 378 */ 379 if (mbytesp != NULL) { 380 *mbytesp = lp->stat.st_wc_mbytes; 381 *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off); 382 } 383 384 LOG_SYSTEM_UNLOCK(env); 385 386 return (0); 387} 388 389/* 390 * __log_put_next -- 391 * Put the given record as the next in the log, wherever that may 392 * turn out to be. 393 */ 394static int 395__log_put_next(env, lsn, dbt, hdr, old_lsnp) 396 ENV *env; 397 DB_LSN *lsn; 398 const DBT *dbt; 399 HDR *hdr; 400 DB_LSN *old_lsnp; 401{ 402 DB_LOG *dblp; 403 DB_LSN old_lsn; 404 LOG *lp; 405 int adv_file, newfile, ret; 406 407 dblp = env->lg_handle; 408 lp = dblp->reginfo.primary; 409 410 /* 411 * Save a copy of lp->lsn before we might decide to switch log 412 * files and change it. If we do switch log files, and we're 413 * doing replication, we'll need to tell our clients about the 414 * switch, and they need to receive a NEWFILE message 415 * with this "would-be" LSN in order to know they're not 416 * missing any log records. 417 */ 418 old_lsn = lp->lsn; 419 newfile = 0; 420 adv_file = 0; 421 /* 422 * If our current log is at an older version and we want to write 423 * a record then we need to advance the log. 424 */ 425 if (lp->persist.version != DB_LOGVERSION) { 426 __log_set_version(env, DB_LOGVERSION); 427 adv_file = 1; 428 } 429 430 /* 431 * If this information won't fit in the file, or if we're a 432 * replication client environment and have been told to do so, 433 * swap files. 434 */ 435 if (adv_file || lp->lsn.offset == 0 || 436 lp->lsn.offset + hdr->size + dbt->size > lp->log_size) { 437 if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) { 438 __db_errx(env, 439 "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)", 440 (u_long)hdr->size + sizeof(LOGP) + dbt->size, 441 (u_long)lp->log_size); 442 return (EINVAL); 443 } 444 445 if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0) 446 return (ret); 447 448 /* 449 * Flag that we switched files, in case we're a master 450 * and need to send this information to our clients. 451 * We postpone doing the actual send until we can 452 * safely release the log region lock and are doing so 453 * anyway. 454 */ 455 newfile = 1; 456 } 457 458 /* If we switched log files, let our caller know where. */ 459 if (newfile) 460 *old_lsnp = old_lsn; 461 462 /* Actually put the record. */ 463 return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr)); 464} 465 466/* 467 * __log_flush_commit -- 468 * Flush a record. 469 */ 470static int 471__log_flush_commit(env, lsnp, flags) 472 ENV *env; 473 const DB_LSN *lsnp; 474 u_int32_t flags; 475{ 476 DB_LOG *dblp; 477 DB_LSN flush_lsn; 478 LOG *lp; 479 int ret; 480 481 dblp = env->lg_handle; 482 lp = dblp->reginfo.primary; 483 flush_lsn = *lsnp; 484 485 ret = 0; 486 487 /* 488 * DB_FLUSH: 489 * Flush a record for which the DB_FLUSH flag to log_put was set. 490 * 491 * DB_LOG_WRNOSYNC: 492 * If there's anything in the current log buffer, write it out. 493 */ 494 if (LF_ISSET(DB_FLUSH)) 495 ret = __log_flush_int(dblp, &flush_lsn, 1); 496 else if (!lp->db_log_inmemory && lp->b_off != 0) 497 if ((ret = __log_write(dblp, 498 dblp->bufp, (u_int32_t)lp->b_off)) == 0) 499 lp->b_off = 0; 500 501 /* 502 * If a flush supporting a transaction commit fails, we must abort the 503 * transaction. (If we aren't doing a commit, return the failure; if 504 * if the commit we care about made it to disk successfully, we just 505 * ignore the failure, because there's no way to undo the commit.) 506 */ 507 if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT)) 508 return (ret); 509 510 if (flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off) 511 return (0); 512 513 /* 514 * Else, make sure that the commit record does not get out after we 515 * abort the transaction. Do this by overwriting the commit record 516 * in the buffer. (Note that other commits in this buffer will wait 517 * until a successful write happens, we do not wake them.) We point 518 * at the right part of the buffer and write an abort record over the 519 * commit. We must then try and flush the buffer again, since the 520 * interesting part of the buffer may have actually made it out to 521 * disk before there was a failure, we can't know for sure. 522 */ 523 if (__txn_force_abort(env, 524 dblp->bufp + flush_lsn.offset - lp->w_off) == 0) 525 (void)__log_flush_int(dblp, &flush_lsn, 0); 526 527 return (ret); 528} 529 530/* 531 * __log_newfile -- 532 * Initialize and switch to a new log file. (Note that this is 533 * called both when no log yet exists and when we fill a log file.) 534 * 535 * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t)); 536 */ 537int 538__log_newfile(dblp, lsnp, logfile, version) 539 DB_LOG *dblp; 540 DB_LSN *lsnp; 541 u_int32_t logfile; 542 u_int32_t version; 543{ 544 DBT t; 545 DB_CIPHER *db_cipher; 546 DB_LSN lsn; 547 ENV *env; 548 HDR hdr; 549 LOG *lp; 550 LOGP *tpersist; 551 int need_free, ret; 552 u_int32_t lastoff; 553 size_t tsize; 554 555 env = dblp->env; 556 lp = dblp->reginfo.primary; 557 558 /* 559 * If we're not specifying a specific log file number and we're 560 * not at the beginning of a file already, start a new one. 561 */ 562 if (logfile == 0 && lp->lsn.offset != 0) { 563 /* 564 * Flush the log so this file is out and can be closed. We 565 * cannot release the region lock here because we need to 566 * protect the end of the file while we switch. In 567 * particular, a thread with a smaller record than ours 568 * could detect that there is space in the log. Even 569 * blocking that event by declaring the file full would 570 * require all threads to wait here so that the lsn.file 571 * can be moved ahead after the flush completes. This 572 * probably can be changed if we had an lsn for the 573 * previous file and one for the current, but it does not 574 * seem like this would get much more throughput, if any. 575 */ 576 if ((ret = __log_flush_int(dblp, NULL, 0)) != 0) 577 return (ret); 578 579 /* 580 * Save the last known offset from the previous file, we'll 581 * need it to initialize the persistent header information. 582 */ 583 lastoff = lp->lsn.offset; 584 585 /* Point the current LSN to the new file. */ 586 ++lp->lsn.file; 587 lp->lsn.offset = 0; 588 589 /* Reset the file write offset. */ 590 lp->w_off = 0; 591 } else 592 lastoff = 0; 593 594 /* 595 * Replication may require we reset the log file name space entirely. 596 * In that case we also force a file switch so that replication can 597 * clean up old files. 598 */ 599 if (logfile != 0) { 600 lp->lsn.file = logfile; 601 lp->lsn.offset = 0; 602 lp->w_off = 0; 603 if (lp->db_log_inmemory) { 604 lsn = lp->lsn; 605 (void)__log_zero(env, &lsn); 606 } else { 607 lp->s_lsn = lp->lsn; 608 if ((ret = __log_newfh(dblp, 1)) != 0) 609 return (ret); 610 } 611 } 612 613 DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0); 614 if (lp->db_log_inmemory && 615 (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0) 616 return (ret); 617 618 /* 619 * Insert persistent information as the first record in every file. 620 * Note that the previous length is wrong for the very first record 621 * of the log, but that's okay, we check for it during retrieval. 622 */ 623 memset(&t, 0, sizeof(t)); 624 memset(&hdr, 0, sizeof(HDR)); 625 626 need_free = 0; 627 tsize = sizeof(LOGP); 628 db_cipher = env->crypto_handle; 629 if (CRYPTO_ON(env)) 630 tsize += db_cipher->adj_size(tsize); 631 if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0) 632 return (ret); 633 need_free = 1; 634 /* 635 * If we're told what version to make this file, then we 636 * need to be at that version. Update here. 637 */ 638 if (version != 0) { 639 __log_set_version(env, version); 640 if ((ret = __env_init_rec(env, version)) != 0) 641 goto err; 642 } 643 lp->persist.log_size = lp->log_size = lp->log_nsize; 644 memcpy(tpersist, &lp->persist, sizeof(LOGP)); 645 DB_SET_DBT(t, tpersist, tsize); 646 if (LOG_SWAPPED(env)) 647 __log_persistswap(tpersist); 648 649 if ((ret = 650 __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0) 651 goto err; 652 if (lp->persist.version != DB_LOGVERSION) 653 __db_chksum(NULL, t.data, t.size, 654 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum); 655 else 656 __db_chksum(&hdr, t.data, t.size, 657 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum); 658 659 if ((ret = __log_putr(dblp, &lsn, 660 &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0) 661 goto err; 662 663 /* Update the LSN information returned to the caller. */ 664 if (lsnp != NULL) 665 *lsnp = lp->lsn; 666 667err: if (need_free) 668 __os_free(env, tpersist); 669 return (ret); 670} 671 672/* 673 * __log_putr -- 674 * Actually put a record into the log. 675 */ 676static int 677__log_putr(dblp, lsn, dbt, prev, h) 678 DB_LOG *dblp; 679 DB_LSN *lsn; 680 const DBT *dbt; 681 u_int32_t prev; 682 HDR *h; 683{ 684 DB_CIPHER *db_cipher; 685 DB_LSN f_lsn; 686 ENV *env; 687 HDR tmp, *hdr; 688 LOG *lp; 689 int ret, t_ret; 690 size_t b_off, nr; 691 u_int32_t w_off; 692 693 env = dblp->env; 694 lp = dblp->reginfo.primary; 695 696 /* 697 * If we weren't given a header, use a local one. 698 */ 699 db_cipher = env->crypto_handle; 700 if (h == NULL) { 701 hdr = &tmp; 702 memset(hdr, 0, sizeof(HDR)); 703 if (CRYPTO_ON(env)) 704 hdr->size = HDR_CRYPTO_SZ; 705 else 706 hdr->size = HDR_NORMAL_SZ; 707 } else 708 hdr = h; 709 710 /* Save our position in case we fail. */ 711 b_off = lp->b_off; 712 w_off = lp->w_off; 713 f_lsn = lp->f_lsn; 714 715 /* 716 * Initialize the header. If we just switched files, lsn.offset will 717 * be 0, and what we really want is the offset of the previous record 718 * in the previous file. Fortunately, prev holds the value we want. 719 */ 720 hdr->prev = prev; 721 hdr->len = (u_int32_t)hdr->size + dbt->size; 722 723 /* 724 * If we were passed in a nonzero checksum, our caller calculated 725 * the checksum before acquiring the log mutex, as an optimization. 726 * 727 * If our caller calculated a real checksum of 0, we'll needlessly 728 * recalculate it. C'est la vie; there's no out-of-bounds value 729 * here. 730 */ 731 if (hdr->chksum[0] == 0) 732 if (lp->persist.version != DB_LOGVERSION) 733 __db_chksum(NULL, dbt->data, dbt->size, 734 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, 735 hdr->chksum); 736 else 737 __db_chksum(hdr, dbt->data, dbt->size, 738 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, 739 hdr->chksum); 740 else if (lp->persist.version == DB_LOGVERSION) { 741 /* 742 * We need to correct for prev and len since they are not 743 * set before here. 744 */ 745 LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum); 746 } 747 748 if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp, 749 (u_int32_t)hdr->size + dbt->size)) != 0) 750 goto err; 751 752 /* 753 * The offset into the log file at this point is the LSN where 754 * we're about to put this record, and is the LSN the caller wants. 755 */ 756 *lsn = lp->lsn; 757 758 nr = hdr->size; 759 if (LOG_SWAPPED(env)) 760 __log_hdrswap(hdr, CRYPTO_ON(env)); 761 762 /* nr can't overflow a 32 bit value - header size is internal. */ 763 ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr); 764 765 if (LOG_SWAPPED(env)) 766 __log_hdrswap(hdr, CRYPTO_ON(env)); 767 768 if (ret != 0) 769 goto err; 770 771 if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0) 772 goto err; 773 774 lp->len = (u_int32_t)(hdr->size + dbt->size); 775 lp->lsn.offset += lp->len; 776 return (0); 777err: 778 /* 779 * If we wrote more than one buffer before failing, get the 780 * first one back. The extra buffers will fail the checksums 781 * and be ignored. 782 */ 783 if (w_off + lp->buffer_size < lp->w_off) { 784 DB_ASSERT(env, !lp->db_log_inmemory); 785 if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 || 786 (t_ret = __os_read(env, dblp->lfhp, dblp->bufp, 787 b_off, &nr)) != 0) 788 return (__env_panic(env, t_ret)); 789 if (nr != b_off) { 790 __db_errx(env, "Short read while restoring log"); 791 return (__env_panic(env, EIO)); 792 } 793 } 794 795 /* Reset to where we started. */ 796 lp->w_off = w_off; 797 lp->b_off = b_off; 798 lp->f_lsn = f_lsn; 799 800 return (ret); 801} 802 803/* 804 * __log_flush_pp -- 805 * ENV->log_flush pre/post processing. 806 * 807 * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *)); 808 */ 809int 810__log_flush_pp(dbenv, lsn) 811 DB_ENV *dbenv; 812 const DB_LSN *lsn; 813{ 814 DB_THREAD_INFO *ip; 815 ENV *env; 816 int ret; 817 818 env = dbenv->env; 819 820 ENV_REQUIRES_CONFIG(env, 821 env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG); 822 823 ENV_ENTER(env, ip); 824 REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret); 825 ENV_LEAVE(env, ip); 826 return (ret); 827} 828 829/* 830 * See if we need to wait. s_lsn is not locked so some care is needed. 831 * The sync point can only move forward. The lsnp->file cannot be 832 * greater than the s_lsn.file. If the file we want is in the past 833 * we are done. If the file numbers are the same check the offset. 834 * This all assumes we can read an 32-bit quantity in one state or 835 * the other, not in transition. 836 */ 837#define ALREADY_FLUSHED(lp, lsnp) \ 838 (((lp)->s_lsn.file > (lsnp)->file) || \ 839 ((lp)->s_lsn.file == (lsnp)->file && \ 840 (lp)->s_lsn.offset > (lsnp)->offset)) 841 842/* 843 * __log_flush -- 844 * ENV->log_flush 845 * 846 * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *)); 847 */ 848int 849__log_flush(env, lsn) 850 ENV *env; 851 const DB_LSN *lsn; 852{ 853 DB_LOG *dblp; 854 LOG *lp; 855 int ret; 856 857 dblp = env->lg_handle; 858 lp = dblp->reginfo.primary; 859 if (lsn != NULL && ALREADY_FLUSHED(lp, lsn)) 860 return (0); 861 LOG_SYSTEM_LOCK(env); 862 ret = __log_flush_int(dblp, lsn, 1); 863 LOG_SYSTEM_UNLOCK(env); 864 return (ret); 865} 866 867/* 868 * __log_flush_int -- 869 * Write all records less than or equal to the specified LSN; internal 870 * version. 871 * 872 * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int)); 873 */ 874int 875__log_flush_int(dblp, lsnp, release) 876 DB_LOG *dblp; 877 const DB_LSN *lsnp; 878 int release; 879{ 880 struct __db_commit *commit; 881 ENV *env; 882 DB_LSN flush_lsn, f_lsn; 883 LOG *lp; 884 size_t b_off; 885 u_int32_t ncommit, w_off; 886 int do_flush, first, ret; 887 888 env = dblp->env; 889 lp = dblp->reginfo.primary; 890 ncommit = 0; 891 ret = 0; 892 893 if (lp->db_log_inmemory) { 894 lp->s_lsn = lp->lsn; 895 STAT(++lp->stat.st_scount); 896 return (0); 897 } 898 899 /* 900 * If no LSN specified, flush the entire log by setting the flush LSN 901 * to the last LSN written in the log. Otherwise, check that the LSN 902 * isn't a non-existent record for the log. 903 */ 904 if (lsnp == NULL) { 905 flush_lsn.file = lp->lsn.file; 906 flush_lsn.offset = lp->lsn.offset - lp->len; 907 } else if (lsnp->file > lp->lsn.file || 908 (lsnp->file == lp->lsn.file && 909 lsnp->offset > lp->lsn.offset - lp->len)) { 910 __db_errx(env, 911 "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu", 912 (u_long)lsnp->file, (u_long)lsnp->offset, 913 (u_long)lp->lsn.file, (u_long)lp->lsn.offset); 914 __db_errx(env, "%s %s %s", 915 "Database environment corrupt; the wrong log files may", 916 "have been removed or incompatible database files imported", 917 "from another environment"); 918 return (__env_panic(env, DB_RUNRECOVERY)); 919 } else { 920 if (ALREADY_FLUSHED(lp, lsnp)) 921 return (0); 922 flush_lsn = *lsnp; 923 } 924 925 /* 926 * If a flush is in progress and we're allowed to do so, drop 927 * the region lock and block waiting for the next flush. 928 */ 929 if (release && lp->in_flush != 0) { 930 if ((commit = SH_TAILQ_FIRST( 931 &lp->free_commits, __db_commit)) == NULL) { 932 if ((ret = __env_alloc(&dblp->reginfo, 933 sizeof(struct __db_commit), &commit)) != 0) 934 goto flush; 935 memset(commit, 0, sizeof(*commit)); 936 if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT, 937 DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) { 938 __env_alloc_free(&dblp->reginfo, commit); 939 return (ret); 940 } 941 MUTEX_LOCK(env, commit->mtx_txnwait); 942 } else 943 SH_TAILQ_REMOVE( 944 &lp->free_commits, commit, links, __db_commit); 945 946 lp->ncommit++; 947 948 /* 949 * Flushes may be requested out of LSN order; be 950 * sure we only move lp->t_lsn forward. 951 */ 952 if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0) 953 lp->t_lsn = flush_lsn; 954 955 commit->lsn = flush_lsn; 956 SH_TAILQ_INSERT_HEAD( 957 &lp->commits, commit, links, __db_commit); 958 LOG_SYSTEM_UNLOCK(env); 959 /* Wait here for the in-progress flush to finish. */ 960 MUTEX_LOCK(env, commit->mtx_txnwait); 961 LOG_SYSTEM_LOCK(env); 962 963 lp->ncommit--; 964 /* 965 * Grab the flag before freeing the struct to see if 966 * we need to flush the log to commit. If so, 967 * use the maximal lsn for any committing thread. 968 */ 969 do_flush = F_ISSET(commit, DB_COMMIT_FLUSH); 970 F_CLR(commit, DB_COMMIT_FLUSH); 971 SH_TAILQ_INSERT_HEAD( 972 &lp->free_commits, commit, links, __db_commit); 973 if (do_flush) { 974 lp->in_flush--; 975 flush_lsn = lp->t_lsn; 976 } else 977 return (0); 978 } 979 980 /* 981 * Protect flushing with its own mutex so we can release 982 * the region lock except during file switches. 983 */ 984flush: MUTEX_LOCK(env, lp->mtx_flush); 985 986 /* 987 * If the LSN is less than or equal to the last-sync'd LSN, we're done. 988 * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte 989 * after the byte we absolutely know was written to disk, so the test 990 * is <, not <=. 991 */ 992 if (flush_lsn.file < lp->s_lsn.file || 993 (flush_lsn.file == lp->s_lsn.file && 994 flush_lsn.offset < lp->s_lsn.offset)) { 995 MUTEX_UNLOCK(env, lp->mtx_flush); 996 goto done; 997 } 998 999 /* 1000 * We may need to write the current buffer. We have to write the 1001 * current buffer if the flush LSN is greater than or equal to the 1002 * buffer's starting LSN. 1003 * 1004 * Otherwise, it's still possible that this thread may never have 1005 * written to this log file. Acquire a file descriptor if we don't 1006 * already have one. 1007 */ 1008 if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) { 1009 if ((ret = __log_write(dblp, 1010 dblp->bufp, (u_int32_t)lp->b_off)) != 0) { 1011 MUTEX_UNLOCK(env, lp->mtx_flush); 1012 goto done; 1013 } 1014 1015 lp->b_off = 0; 1016 } else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file) 1017 if ((ret = __log_newfh(dblp, 0)) != 0) { 1018 MUTEX_UNLOCK(env, lp->mtx_flush); 1019 goto done; 1020 } 1021 1022 /* 1023 * We are going to flush, release the region. 1024 * First get the current state of the buffer since 1025 * another write may come in, but we may not flush it. 1026 */ 1027 b_off = lp->b_off; 1028 w_off = lp->w_off; 1029 f_lsn = lp->f_lsn; 1030 lp->in_flush++; 1031 if (release) 1032 LOG_SYSTEM_UNLOCK(env); 1033 1034 /* Sync all writes to disk. */ 1035 if ((ret = __os_fsync(env, dblp->lfhp)) != 0) { 1036 MUTEX_UNLOCK(env, lp->mtx_flush); 1037 if (release) 1038 LOG_SYSTEM_LOCK(env); 1039 ret = __env_panic(env, ret); 1040 return (ret); 1041 } 1042 1043 /* 1044 * Set the last-synced LSN. 1045 * This value must be set to the LSN past the last complete 1046 * record that has been flushed. This is at least the first 1047 * lsn, f_lsn. If the buffer is empty, b_off == 0, then 1048 * we can move up to write point since the first lsn is not 1049 * set for the new buffer. 1050 */ 1051 lp->s_lsn = f_lsn; 1052 if (b_off == 0) 1053 lp->s_lsn.offset = w_off; 1054 1055 MUTEX_UNLOCK(env, lp->mtx_flush); 1056 if (release) 1057 LOG_SYSTEM_LOCK(env); 1058 1059 lp->in_flush--; 1060 STAT(++lp->stat.st_scount); 1061 1062 /* 1063 * How many flush calls (usually commits) did this call actually sync? 1064 * At least one, if it got here. 1065 */ 1066 ncommit = 1; 1067done: 1068 if (lp->ncommit != 0) { 1069 first = 1; 1070 SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit) 1071 if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) { 1072 MUTEX_UNLOCK(env, commit->mtx_txnwait); 1073 SH_TAILQ_REMOVE( 1074 &lp->commits, commit, links, __db_commit); 1075 ncommit++; 1076 } else if (first == 1) { 1077 F_SET(commit, DB_COMMIT_FLUSH); 1078 MUTEX_UNLOCK(env, commit->mtx_txnwait); 1079 SH_TAILQ_REMOVE( 1080 &lp->commits, commit, links, __db_commit); 1081 /* 1082 * This thread will wake and flush. 1083 * If another thread commits and flushes 1084 * first we will waste a trip trough the 1085 * mutex. 1086 */ 1087 lp->in_flush++; 1088 first = 0; 1089 } 1090 } 1091#ifdef HAVE_STATISTICS 1092 if (lp->stat.st_maxcommitperflush < ncommit) 1093 lp->stat.st_maxcommitperflush = ncommit; 1094 if (lp->stat.st_mincommitperflush > ncommit || 1095 lp->stat.st_mincommitperflush == 0) 1096 lp->stat.st_mincommitperflush = ncommit; 1097#endif 1098 1099 return (ret); 1100} 1101 1102/* 1103 * __log_fill -- 1104 * Write information into the log. 1105 */ 1106static int 1107__log_fill(dblp, lsn, addr, len) 1108 DB_LOG *dblp; 1109 DB_LSN *lsn; 1110 void *addr; 1111 u_int32_t len; 1112{ 1113 LOG *lp; 1114 u_int32_t bsize, nrec; 1115 size_t nw, remain; 1116 int ret; 1117 1118 lp = dblp->reginfo.primary; 1119 bsize = lp->buffer_size; 1120 1121 if (lp->db_log_inmemory) { 1122 __log_inmem_copyin(dblp, lp->b_off, addr, len); 1123 lp->b_off = (lp->b_off + len) % lp->buffer_size; 1124 return (0); 1125 } 1126 1127 while (len > 0) { /* Copy out the data. */ 1128 /* 1129 * If we're beginning a new buffer, note the user LSN to which 1130 * the first byte of the buffer belongs. We have to know this 1131 * when flushing the buffer so that we know if the in-memory 1132 * buffer needs to be flushed. 1133 */ 1134 if (lp->b_off == 0) 1135 lp->f_lsn = *lsn; 1136 1137 /* 1138 * If we're on a buffer boundary and the data is big enough, 1139 * copy as many records as we can directly from the data. 1140 */ 1141 if (lp->b_off == 0 && len >= bsize) { 1142 nrec = len / bsize; 1143 if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0) 1144 return (ret); 1145 addr = (u_int8_t *)addr + nrec * bsize; 1146 len -= nrec * bsize; 1147 STAT(++lp->stat.st_wcount_fill); 1148 continue; 1149 } 1150 1151 /* Figure out how many bytes we can copy this time. */ 1152 remain = bsize - lp->b_off; 1153 nw = remain > len ? len : remain; 1154 memcpy(dblp->bufp + lp->b_off, addr, nw); 1155 addr = (u_int8_t *)addr + nw; 1156 len -= (u_int32_t)nw; 1157 lp->b_off += nw; 1158 1159 /* If we fill the buffer, flush it. */ 1160 if (lp->b_off == bsize) { 1161 if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0) 1162 return (ret); 1163 lp->b_off = 0; 1164 STAT(++lp->stat.st_wcount_fill); 1165 } 1166 } 1167 return (0); 1168} 1169 1170/* 1171 * __log_write -- 1172 * Write the log buffer to disk. 1173 */ 1174static int 1175__log_write(dblp, addr, len) 1176 DB_LOG *dblp; 1177 void *addr; 1178 u_int32_t len; 1179{ 1180 ENV *env; 1181 LOG *lp; 1182 size_t nw; 1183 int ret; 1184 1185 env = dblp->env; 1186 lp = dblp->reginfo.primary; 1187 1188 DB_ASSERT(env, !lp->db_log_inmemory); 1189 1190 /* 1191 * If we haven't opened the log file yet or the current one has 1192 * changed, acquire a new log file. We are creating the file if we're 1193 * about to write to the start of it, in other words, if the write 1194 * offset is zero. 1195 */ 1196 if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file || 1197 dblp->lf_timestamp != lp->timestamp) 1198 if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0) 1199 return (ret); 1200 1201 /* 1202 * If we're writing the first block in a log file on a filesystem that 1203 * guarantees unwritten blocks are zero-filled, we set the size of the 1204 * file in advance. This increases sync performance on some systems, 1205 * because they don't need to update metadata on every sync. 1206 * 1207 * Ignore any error -- we may have run out of disk space, but that's no 1208 * reason to quit. 1209 */ 1210#ifdef HAVE_FILESYSTEM_NOTZERO 1211 if (lp->w_off == 0 && !__os_fs_notzero()) { 1212#else 1213 if (lp->w_off == 0) { 1214#endif 1215 (void)__db_file_extend(env, dblp->lfhp, lp->log_size); 1216 if (F_ISSET(dblp, DBLOG_ZERO)) 1217 (void)__db_zero_extend(env, dblp->lfhp, 1218 0, lp->log_size/lp->buffer_size, lp->buffer_size); 1219 1220 } 1221 1222 /* 1223 * Seek to the offset in the file (someone may have written it 1224 * since we last did). 1225 */ 1226 if ((ret = __os_io(env, DB_IO_WRITE, 1227 dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0) 1228 return (ret); 1229 1230 /* Reset the buffer offset and update the seek offset. */ 1231 lp->w_off += len; 1232 1233 /* Update written statistics. */ 1234 if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) { 1235 lp->stat.st_wc_bytes -= MEGABYTE; 1236 ++lp->stat.st_wc_mbytes; 1237 } 1238#ifdef HAVE_STATISTICS 1239 if ((lp->stat.st_w_bytes += len) >= MEGABYTE) { 1240 lp->stat.st_w_bytes -= MEGABYTE; 1241 ++lp->stat.st_w_mbytes; 1242 } 1243 ++lp->stat.st_wcount; 1244#endif 1245 1246 return (0); 1247} 1248 1249/* 1250 * __log_file_pp -- 1251 * ENV->log_file pre/post processing. 1252 * 1253 * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t)); 1254 */ 1255int 1256__log_file_pp(dbenv, lsn, namep, len) 1257 DB_ENV *dbenv; 1258 const DB_LSN *lsn; 1259 char *namep; 1260 size_t len; 1261{ 1262 DB_THREAD_INFO *ip; 1263 ENV *env; 1264 int ret, set; 1265 1266 env = dbenv->env; 1267 1268 ENV_REQUIRES_CONFIG(env, 1269 env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG); 1270 1271 if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0) 1272 return (ret); 1273 if (set) { 1274 __db_errx(env, 1275 "DB_ENV->log_file is illegal with in-memory logs"); 1276 return (EINVAL); 1277 } 1278 1279 ENV_ENTER(env, ip); 1280 REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret); 1281 ENV_LEAVE(env, ip); 1282 return (ret); 1283} 1284 1285/* 1286 * __log_file -- 1287 * ENV->log_file. 1288 */ 1289static int 1290__log_file(env, lsn, namep, len) 1291 ENV *env; 1292 const DB_LSN *lsn; 1293 char *namep; 1294 size_t len; 1295{ 1296 DB_LOG *dblp; 1297 int ret; 1298 char *name; 1299 1300 dblp = env->lg_handle; 1301 LOG_SYSTEM_LOCK(env); 1302 ret = __log_name(dblp, lsn->file, &name, NULL, 0); 1303 LOG_SYSTEM_UNLOCK(env); 1304 if (ret != 0) 1305 return (ret); 1306 1307 /* Check to make sure there's enough room and copy the name. */ 1308 if (len < strlen(name) + 1) { 1309 *namep = '\0'; 1310 __db_errx(env, "DB_ENV->log_file: name buffer is too short"); 1311 return (EINVAL); 1312 } 1313 (void)strcpy(namep, name); 1314 __os_free(env, name); 1315 1316 return (0); 1317} 1318 1319/* 1320 * __log_newfh -- 1321 * Acquire a file handle for the current log file. 1322 */ 1323static int 1324__log_newfh(dblp, create) 1325 DB_LOG *dblp; 1326 int create; 1327{ 1328 ENV *env; 1329 LOG *lp; 1330 u_int32_t flags; 1331 int ret; 1332 logfile_validity status; 1333 1334 env = dblp->env; 1335 lp = dblp->reginfo.primary; 1336 1337 /* Close any previous file descriptor. */ 1338 if (dblp->lfhp != NULL) { 1339 (void)__os_closehandle(env, dblp->lfhp); 1340 dblp->lfhp = NULL; 1341 } 1342 1343 flags = DB_OSO_SEQ | 1344 (create ? DB_OSO_CREATE : 0) | 1345 (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) | 1346 (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0); 1347 1348 /* Get the path of the new file and open it. */ 1349 dblp->lfname = lp->lsn.file; 1350 if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp, 1351 flags, &status, NULL)) != 0) 1352 __db_err(env, ret, 1353 "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file); 1354 else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE && 1355 status != DB_LV_OLD_READABLE) 1356 ret = DB_NOTFOUND; 1357 1358 return (ret); 1359} 1360 1361/* 1362 * __log_name -- 1363 * Return the log name for a particular file, and optionally open it. 1364 * 1365 * PUBLIC: int __log_name __P((DB_LOG *, 1366 * PUBLIC: u_int32_t, char **, DB_FH **, u_int32_t)); 1367 */ 1368int 1369__log_name(dblp, filenumber, namep, fhpp, flags) 1370 DB_LOG *dblp; 1371 u_int32_t filenumber, flags; 1372 char **namep; 1373 DB_FH **fhpp; 1374{ 1375 ENV *env; 1376 LOG *lp; 1377 int mode, ret; 1378 char *oname; 1379 char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20]; 1380 1381 env = dblp->env; 1382 lp = dblp->reginfo.primary; 1383 1384 DB_ASSERT(env, !lp->db_log_inmemory); 1385 1386 /* 1387 * !!! 1388 * The semantics of this routine are bizarre. 1389 * 1390 * The reason for all of this is that we need a place where we can 1391 * intercept requests for log files, and, if appropriate, check for 1392 * both the old-style and new-style log file names. The trick is 1393 * that all callers of this routine that are opening the log file 1394 * read-only want to use an old-style file name if they can't find 1395 * a match using a new-style name. The only down-side is that some 1396 * callers may check for the old-style when they really don't need 1397 * to, but that shouldn't mess up anything, and we only check for 1398 * the old-style name when we've already failed to find a new-style 1399 * one. 1400 * 1401 * Create a new-style file name, and if we're not going to open the 1402 * file, return regardless. 1403 */ 1404 (void)snprintf(new, sizeof(new), LFNAME, filenumber); 1405 if ((ret = __db_appname(env, 1406 DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhpp == NULL) 1407 return (ret); 1408 1409 /* The application may have specified an absolute file mode. */ 1410 if (lp->filemode == 0) 1411 mode = env->db_mode; 1412 else { 1413 LF_SET(DB_OSO_ABSMODE); 1414 mode = lp->filemode; 1415 } 1416 1417 /* Open the new-style file -- if we succeed, we're done. */ 1418 dblp->lf_timestamp = lp->timestamp; 1419 if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0) 1420 return (0); 1421 1422 /* 1423 * If the open failed for reason other than the file 1424 * not being there, complain loudly, the wrong user 1425 * probably started up the application. 1426 */ 1427 if (ret != ENOENT) { 1428 __db_err(env, ret, "%s: log file unreadable", *namep); 1429 return (__env_panic(env, ret)); 1430 } 1431 1432 /* 1433 * The open failed... if the DB_RDONLY flag isn't set, we're done, 1434 * the caller isn't interested in old-style files. 1435 */ 1436 if (!LF_ISSET(DB_OSO_RDONLY)) { 1437 __db_err(env, ret, "%s: log file open failed", *namep); 1438 return (__env_panic(env, ret)); 1439 } 1440 1441 /* Create an old-style file name. */ 1442 (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber); 1443 if ((ret = __db_appname(env, DB_APP_LOG, old, 0, NULL, &oname)) != 0) 1444 goto err; 1445 1446 /* 1447 * Open the old-style file -- if we succeed, we're done. Free the 1448 * space allocated for the new-style name and return the old-style 1449 * name to the caller. 1450 */ 1451 if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) { 1452 __os_free(env, *namep); 1453 *namep = oname; 1454 return (0); 1455 } 1456 1457 /* 1458 * Couldn't find either style of name -- return the new-style name 1459 * for the caller's error message. If it's an old-style name that's 1460 * actually missing we're going to confuse the user with the error 1461 * message, but that implies that not only were we looking for an 1462 * old-style name, but we expected it to exist and we weren't just 1463 * looking for any log file. That's not a likely error. 1464 */ 1465err: __os_free(env, oname); 1466 return (ret); 1467} 1468 1469/* 1470 * __log_rep_put -- 1471 * Short-circuit way for replication clients to put records into the 1472 * log. Replication clients' logs need to be laid out exactly as their masters' 1473 * are, so we let replication take responsibility for when the log gets 1474 * flushed, when log switches files, etc. This is just a thin PUBLIC wrapper 1475 * for __log_putr with a slightly prettier interface. 1476 * 1477 * Note that the REP->mtx_clientdb should be held when this is called. 1478 * Note that we acquire the log region mutex while holding mtx_clientdb. 1479 * 1480 * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t)); 1481 */ 1482int 1483__log_rep_put(env, lsnp, rec, flags) 1484 ENV *env; 1485 DB_LSN *lsnp; 1486 const DBT *rec; 1487 u_int32_t flags; 1488{ 1489 DBT *dbt, t; 1490 DB_CIPHER *db_cipher; 1491 DB_LOG *dblp; 1492 HDR hdr; 1493 LOG *lp; 1494 int need_free, ret; 1495 1496 dblp = env->lg_handle; 1497 lp = dblp->reginfo.primary; 1498 1499 LOG_SYSTEM_LOCK(env); 1500 memset(&hdr, 0, sizeof(HDR)); 1501 t = *rec; 1502 dbt = &t; 1503 need_free = 0; 1504 db_cipher = env->crypto_handle; 1505 if (CRYPTO_ON(env)) 1506 t.size += db_cipher->adj_size(rec->size); 1507 if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0) 1508 goto err; 1509 need_free = 1; 1510 memcpy(t.data, rec->data, rec->size); 1511 1512 if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0) 1513 goto err; 1514 __db_chksum(&hdr, t.data, t.size, 1515 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum); 1516 1517 DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0); 1518 ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr); 1519err: 1520 /* 1521 * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn. 1522 */ 1523 lp->ready_lsn = lp->lsn; 1524 1525 if (LF_ISSET(DB_LOG_CHKPNT)) 1526 lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0; 1527 1528 /* Increment count of records added to the log. */ 1529 STAT(++lp->stat.st_record); 1530 LOG_SYSTEM_UNLOCK(env); 1531 if (need_free) 1532 __os_free(env, t.data); 1533 return (ret); 1534} 1535 1536static int 1537__log_encrypt_record(env, dbt, hdr, orig) 1538 ENV *env; 1539 DBT *dbt; 1540 HDR *hdr; 1541 u_int32_t orig; 1542{ 1543 DB_CIPHER *db_cipher; 1544 int ret; 1545 1546 if (CRYPTO_ON(env)) { 1547 db_cipher = env->crypto_handle; 1548 hdr->size = HDR_CRYPTO_SZ; 1549 hdr->orig_size = orig; 1550 if ((ret = db_cipher->encrypt(env, db_cipher->data, 1551 hdr->iv, dbt->data, dbt->size)) != 0) 1552 return (ret); 1553 } else { 1554 hdr->size = HDR_NORMAL_SZ; 1555 } 1556 return (0); 1557} 1558