1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2001,2008 Oracle. All rights reserved. 5 * 6 * $Id: fop_util.c,v 12.65 2008/05/07 12:27:34 bschmeck Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/db_am.h" 14#include "dbinc/hash.h" 15#include "dbinc/fop.h" 16#include "dbinc/lock.h" 17#include "dbinc/mp.h" 18#include "dbinc/log.h" 19#include "dbinc/txn.h" 20 21static int __fop_set_pgsize __P((DB *, DB_FH *, const char *)); 22static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t)); 23static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *)); 24static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t)); 25static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *, 26 const char *, const char *, const char *, DB_LOCKER *)); 27static int __fop_ondisk_dummy __P((DB *, 28 DB_TXN *, const char *, u_int8_t *, u_int32_t)); 29static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *, 30 const char *, const char *, const char *, DB_LOCKER *, u_int32_t)); 31 32/* 33 * Acquire the environment meta-data lock. The parameters are the 34 * environment (ENV), the locker id to use in acquiring the lock (ID) 35 * and a pointer to a DB_LOCK. 36 * 37 * !!! 38 * Turn off locking for Critical Path. The application must do its own 39 * synchronization of open/create. Two threads creating and opening a 40 * file at the same time may have unpredictable results. 41 */ 42#ifdef CRITICALPATH_10266 43#define GET_ENVLOCK(ENV, ID, L) (0) 44#else 45#define GET_ENVLOCK(ENV, ID, L) do { \ 46 DBT __dbt; \ 47 u_int32_t __lockval; \ 48 \ 49 if (LOCKING_ON((ENV))) { \ 50 __lockval = 1; \ 51 __dbt.data = &__lockval; \ 52 __dbt.size = sizeof(__lockval); \ 53 if ((ret = __lock_get((ENV), (ID), \ 54 0, &__dbt, DB_LOCK_WRITE, (L))) != 0) \ 55 goto err; \ 56 } \ 57} while (0) 58#endif 59 60#define RESET_MPF(D, F) do { \ 61 (void)__memp_fclose((D)->mpf, (F)); \ 62 (D)->mpf = NULL; \ 63 F_CLR((D), DB_AM_OPEN_CALLED); \ 64 if ((ret = __memp_fcreate((D)->env, &(D)->mpf)) != 0) \ 65 goto err; \ 66} while (0) 67 68/* 69 * If we open a file handle and our caller is doing fcntl(2) locking, 70 * we can't close the handle because that would discard the caller's 71 * lock. Save it until we close or refresh the DB handle. 72 */ 73#define CLOSE_HANDLE(D, F) { \ 74 if ((F) != NULL) { \ 75 if (LF_ISSET(DB_FCNTL_LOCKING)) \ 76 (D)->saved_open_fhp = (F); \ 77 else if ((t_ret = \ 78 __os_closehandle((D)->env, (F))) != 0) { \ 79 if (ret == 0) \ 80 ret = t_ret; \ 81 goto err; \ 82 } \ 83 (F) = NULL; \ 84 } \ 85} 86 87/* 88 * __fop_lock_handle -- 89 * 90 * Get the handle lock for a database. If the envlock is specified, do this 91 * as a lock_vec call that releases the environment lock before acquiring the 92 * handle lock. 93 * 94 * PUBLIC: int __fop_lock_handle __P((ENV *, 95 * PUBLIC: DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t)); 96 * 97 */ 98int 99__fop_lock_handle(env, dbp, locker, mode, elockp, flags) 100 ENV *env; 101 DB *dbp; 102 DB_LOCKER *locker; 103 db_lockmode_t mode; 104 DB_LOCK *elockp; 105 u_int32_t flags; 106{ 107 DBT fileobj; 108 DB_LOCKREQ reqs[2], *ereq; 109 DB_LOCK_ILOCK lock_desc; 110 int ret; 111 112 if (!LOCKING_ON(env) || 113 F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER)) 114 return (0); 115 116 /* 117 * If we are in recovery, the only locking we should be 118 * doing is on the global environment. 119 */ 120 if (IS_RECOVERING(env)) 121 return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp)); 122 123 memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN); 124 lock_desc.pgno = dbp->meta_pgno; 125 lock_desc.type = DB_HANDLE_LOCK; 126 127 memset(&fileobj, 0, sizeof(fileobj)); 128 fileobj.data = &lock_desc; 129 fileobj.size = sizeof(lock_desc); 130 DB_TEST_SUBLOCKS(env, flags); 131 if (elockp == NULL) 132 ret = __lock_get(env, locker, 133 flags, &fileobj, mode, &dbp->handle_lock); 134 else { 135 reqs[0].op = DB_LOCK_PUT; 136 reqs[0].lock = *elockp; 137 reqs[1].op = DB_LOCK_GET; 138 reqs[1].mode = mode; 139 reqs[1].obj = &fileobj; 140 reqs[1].timeout = 0; 141 if ((ret = __lock_vec(env, 142 locker, flags, reqs, 2, &ereq)) == 0) { 143 dbp->handle_lock = reqs[1].lock; 144 LOCK_INIT(*elockp); 145 } else if (ereq != reqs) 146 LOCK_INIT(*elockp); 147 } 148 149 dbp->cur_locker = locker; 150 return (ret); 151} 152 153/* 154 * __fop_file_setup -- 155 * 156 * Perform all the needed checking and locking to open up or create a 157 * file. 158 * 159 * There's a reason we don't push this code down into the buffer cache. 160 * The problem is that there's no information external to the file that 161 * we can use as a unique ID. UNIX has dev/inode pairs, but they are 162 * not necessarily unique after reboot, if the file was mounted via NFS. 163 * Windows has similar problems, as the FAT filesystem doesn't maintain 164 * dev/inode numbers across reboot. So, we must get something from the 165 * file we can use to ensure that, even after a reboot, the file we're 166 * joining in the cache is the right file for us to join. The solution 167 * we use is to maintain a file ID that's stored in the database, and 168 * that's why we have to open and read the file before calling into the 169 * buffer cache or obtaining a lock (we use this unique fileid to lock 170 * as well as to identify like files in the cache). 171 * 172 * There are a couple of idiosyncrasies that this code must support, in 173 * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow 174 * DB_TRUNCATE in the presence of transactions, since opening a file with 175 * O_TRUNC will result in data being lost in an unrecoverable fashion. 176 * We also disallow DB_TRUNCATE if locking is enabled, because even in 177 * the presence of locking, we cannot avoid race conditions, so allowing 178 * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more 179 * details. 180 * 181 * However, if you are running with neither locking nor transactions, then 182 * you can specify DB_TRUNCATE, and if you do so, we will truncate the file 183 * regardless of its contents. 184 * 185 * FCNTL locking introduces another set of complications. First, the only 186 * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility 187 * with programs like Sendmail and Postfix. In these cases, the caller may 188 * already have a lock on the file; we need to make sure that any file handles 189 * we open remain open, because if we were to close them, the lock held by the 190 * caller would go away. Furthermore, Sendmail and/or Postfix need the ability 191 * to create databases in empty files. So, when you're doing FCNTL locking, 192 * it's reasonable that you are trying to create a database into a 0-length 193 * file and we allow it, while under normal conditions, we do not create 194 * databases if the files already exist and are not Berkeley DB files. 195 * 196 * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip, 197 * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *)); 198 */ 199int 200__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp) 201 DB *dbp; 202 DB_THREAD_INFO *ip; 203 DB_TXN *txn; 204 const char *name; 205 int mode; 206 u_int32_t flags, *retidp; 207{ 208 DBTYPE save_type; 209 DB_FH *fhp; 210 DB_LOCK elock; 211 DB_LOCKER *locker; 212 DB_TXN *stxn; 213 ENV *env; 214 size_t len; 215 u_int32_t dflags, oflags; 216 u_int8_t mbuf[DBMETASIZE]; 217 int created_locker, create_ok, ret, retries, t_ret, tmp_created; 218 int truncating, was_inval; 219 char *real_name, *real_tmpname, *tmpname; 220 221 *retidp = TXN_INVALID; 222 223 env = dbp->env; 224 fhp = NULL; 225 LOCK_INIT(elock); 226 stxn = NULL; 227 created_locker = tmp_created = truncating = was_inval = 0; 228 real_name = real_tmpname = tmpname = NULL; 229 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; 230 231 ret = 0; 232 retries = 0; 233 save_type = dbp->type; 234 235 /* 236 * Get a lockerid for this handle. There are paths through queue 237 * rename and remove where this dbp already has a locker, so make 238 * sure we don't clobber it and conflict. 239 */ 240 if (LOCKING_ON(env) && 241 !F_ISSET(dbp, DB_AM_COMPENSATE) && 242 !F_ISSET(dbp, DB_AM_RECOVER) && 243 dbp->locker == DB_LOCK_INVALIDID) { 244 if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0) 245 goto err; 246 created_locker = 1; 247 } 248 LOCK_INIT(dbp->handle_lock); 249 250 locker = txn == NULL ? dbp->locker : txn->locker; 251 252 oflags = 0; 253 if (F_ISSET(dbp, DB_AM_INMEM)) 254 real_name = (char *)name; 255 else { 256 /* Get the real backing file name. */ 257 if ((ret = __db_appname(env, 258 DB_APP_DATA, name, 0, NULL, &real_name)) != 0) 259 goto err; 260 261 /* Fill in the default file mode. */ 262 if (mode == 0) 263 mode = DB_MODE_660; 264 265 if (LF_ISSET(DB_RDONLY)) 266 oflags |= DB_OSO_RDONLY; 267 if (LF_ISSET(DB_TRUNCATE)) 268 oflags |= DB_OSO_TRUNC; 269 } 270 271 retries = 0; 272 create_ok = LF_ISSET(DB_CREATE); 273 LF_CLR(DB_CREATE); 274 275retry: 276 /* 277 * If we cannot create the file, only retry a few times. We 278 * think we might be in a race with another create, but it could 279 * be that the backup filename exists (that is, is left over from 280 * a previous crash). 281 */ 282 if (++retries > DB_RETRY) { 283 __db_errx(env, "__fop_file_setup: Retry limit (%d) exceeded", 284 DB_RETRY); 285 goto err; 286 } 287 if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER)) 288 GET_ENVLOCK(env, locker, &elock); 289 if (name == NULL) 290 ret = ENOENT; 291 else if (F_ISSET(dbp, DB_AM_INMEM)) { 292 ret = __env_mpool(dbp, name, flags); 293 /* 294 * We are using __env_open as a check for existence. 295 * However, __env_mpool does an actual open and there 296 * are scenarios where the object exists, but cannot be 297 * opened, because our settings don't match those internally. 298 * We need to check for that explicitly. We'll need the 299 * mpool open to read the meta-data page, so we're going to 300 * have to temporarily turn this dbp into an UNKNOWN one. 301 */ 302 if (ret == EINVAL) { 303 was_inval = 1; 304 save_type = dbp->type; 305 dbp->type = DB_UNKNOWN; 306 ret = __env_mpool(dbp, name, flags); 307 dbp->type = save_type; 308 } 309 } else 310 ret = __os_exists(env, real_name, NULL); 311 312 if (ret == 0) { 313 /* 314 * If the file exists, there are 5 possible cases: 315 * 1. DB_EXCL was specified so this is an error, unless 316 * this is a file left around after a rename and we 317 * are in the same transaction. This gets decomposed 318 * into several subcases, because we check for various 319 * errors before we know we're in rename. 320 * 2. We are truncating, and it doesn't matter what kind 321 * of file it is, we should open/create it. 322 * 3. It is 0-length, we are not doing transactions (i.e., 323 * we are sendmail), we should open/create into it. 324 * -- on-disk files only! 325 * 4. Is it a Berkeley DB file and we should simply open it. 326 * 5. It is not a BDB file and we should return an error. 327 */ 328 329 /* Open file (if there is one). */ 330reopen: if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = 331 __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0) 332 goto err; 333 334 /* Case 2: DB_TRUNCATE: we must do the creation in place. */ 335 if (LF_ISSET(DB_TRUNCATE)) { 336 if (LF_ISSET(DB_EXCL)) { 337 /* Case 1a: DB_EXCL and DB_TRUNCATE. */ 338 ret = EEXIST; 339 goto err; 340 } 341 tmpname = (char *)name; 342 goto creat2; 343 } 344 345 /* Cases 1,3-5: we need to read the meta-data page. */ 346 if (F_ISSET(dbp, DB_AM_INMEM)) 347 ret = __fop_inmem_read_meta(dbp, txn, name, flags); 348 else { 349 ret = __fop_read_meta(env, real_name, mbuf, 350 sizeof(mbuf), fhp, 351 LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL ? 1 : 0, 352 &len); 353 354 /* Case 3: 0-length, no txns. */ 355 if (ret != 0 && len == 0 && txn == NULL) { 356 if (LF_ISSET(DB_EXCL)) { 357 /* 358 * Case 1b: DB_EXCL and 359 * 0-length file exists. 360 */ 361 ret = EEXIST; 362 goto err; 363 } 364 tmpname = (char *)name; 365 goto creat2; 366 } 367 368 /* Case 4: This is a valid file. */ 369 if (ret == 0) 370 ret = __db_meta_setup(env, dbp, 371 real_name, (DBMETA *)mbuf, flags, 1); 372 373 } 374 375 /* Case 5: Invalid file. */ 376 if (ret != 0) 377 goto err; 378 379 /* Now, get our handle lock. */ 380 if ((ret = __fop_lock_handle(env, 381 dbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) == 0) { 382 if ((ret = __ENV_LPUT(env, elock)) != 0) 383 goto err; 384 } else if (ret != DB_LOCK_NOTGRANTED || 385 (txn != NULL && F_ISSET(txn, TXN_NOWAIT))) 386 goto err; 387 else { 388 /* 389 * We were unable to acquire the handle lock without 390 * blocking. The fact that we are blocking might mean 391 * that someone else is trying to delete the file. 392 * Since some platforms cannot delete files while they 393 * are open (Windows), we are going to have to close 394 * the file. This would be a problem if we were doing 395 * FCNTL locking, because our closing the handle would 396 * release the FCNTL locks. Fortunately, if we are 397 * doing FCNTL locking, then we should never fail to 398 * acquire our handle lock, so we should never get here. 399 * We assert it here to make sure we aren't destroying 400 * any application level FCNTL semantics. 401 */ 402 DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING)); 403 if (!F_ISSET(dbp, DB_AM_INMEM)) { 404 if ((ret = __os_closehandle(env, fhp)) != 0) 405 goto err; 406 fhp = NULL; 407 } 408 if ((ret = __fop_lock_handle(env, 409 dbp, locker, DB_LOCK_READ, &elock, 0)) != 0) { 410 if (F_ISSET(dbp, DB_AM_INMEM)) 411 RESET_MPF(dbp, 0); 412 goto err; 413 } 414 415 /* 416 * It's possible that our DBP was initialized 417 * with a different file last time we opened it. 418 * Therefore, we need to reset the DBP type and then 419 * re-read the meta-data page and reset any other 420 * fields that __db_meta_setup initializes. We 421 * need to shut down this dbp and reopen for in-memory 422 * named databases. Unfortunately __db_refresh is 423 * pretty aggressive at the shutting down, so we need 424 * to do a bunch of restoration. 425 * XXX it would be nice to pull refresh apart into 426 * the stuff you need to do to call __env_mpool 427 * and the stuff you can really throw away. 428 */ 429 if (F_ISSET(dbp, DB_AM_INMEM)) { 430 if ((ret = __db_refresh(dbp, 431 txn, DB_NOSYNC, NULL, 1)) != 0) 432 goto err; 433 ret = __env_mpool(dbp, name, flags); 434 } else 435 ret = 436 __os_open(env, real_name, 0, 0, 0, &fhp); 437 438 if (ret != 0) { 439 if ((ret = 440 __ENV_LPUT(env, dbp->handle_lock)) != 0) { 441 LOCK_INIT(dbp->handle_lock); 442 goto err; 443 } 444 goto retry; 445 } 446 447 dbp->type = save_type; 448 if (F_ISSET(dbp, DB_AM_INMEM)) 449 ret = __fop_inmem_read_meta(dbp, 450 txn, name, flags); 451 else if ((ret = 452 __fop_read_meta(env, real_name, mbuf, 453 sizeof(mbuf), fhp, 454 LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL ? 1 : 0, 455 &len)) != 0 || 456 (ret = __db_meta_setup(env, dbp, real_name, 457 (DBMETA *)mbuf, flags, DB_CHK_META)) != 0) 458 goto err; 459 460 } 461 462 /* If we got here, then we have the handle lock. */ 463 464 /* 465 * Check for a file in the midst of a rename. If we find that 466 * the file is in the midst of a rename, it must be the case 467 * that it is in our current transaction (else we would still 468 * be blocking), so we can continue along and create a new file 469 * with the same name. In that case, we have to close the file 470 * handle because we reuse it below. This is a case where 471 * a 'was_inval' above is OK. 472 */ 473 if (F_ISSET(dbp, DB_AM_IN_RENAME)) { 474 was_inval = 0; 475 if (create_ok) { 476 if (F_ISSET(dbp, DB_AM_INMEM)) { 477 RESET_MPF(dbp, DB_MPOOL_DISCARD); 478 } else if ((ret = 479 __os_closehandle(env, fhp)) != 0) 480 goto err; 481 LF_SET(DB_CREATE); 482 goto create; 483 } else { 484 ret = ENOENT; 485 goto err; 486 } 487 } 488 489 /* If we get here, a was_inval is bad. */ 490 if (was_inval) { 491 ret = EINVAL; 492 goto err; 493 } 494 495 /* 496 * Now, case 1: check for DB_EXCL, because the file that exists 497 * is not in the middle of a rename, so we have an error. This 498 * is a weird case, but we need to make sure that we don't 499 * continue to hold the handle lock, since technically, we 500 * should not have been allowed to open it. 501 */ 502 if (LF_ISSET(DB_EXCL)) { 503 ret = __ENV_LPUT(env, dbp->handle_lock); 504 LOCK_INIT(dbp->handle_lock); 505 if (ret == 0) 506 ret = EEXIST; 507 goto err; 508 } 509 goto done; 510 } 511 512 /* File does not exist. */ 513#ifdef HAVE_VXWORKS 514 /* 515 * VxWorks can return file-system specific error codes if the 516 * file does not exist, not ENOENT. 517 */ 518 if (!create_ok) 519#else 520 if (!create_ok || ret != ENOENT) 521#endif 522 goto err; 523 LF_SET(DB_CREATE); 524 ret = 0; 525 526 /* 527 * We need to create file, which means that we need to set up the file, 528 * the fileid and the locks. Then we need to call the appropriate 529 * routines to create meta-data pages. For in-memory files, we retain 530 * the environment lock, while for on-disk files, we drop the env lock 531 * and create into a temporary. 532 */ 533 if (!F_ISSET(dbp, DB_AM_INMEM) && 534 (ret = __ENV_LPUT(env, elock)) != 0) 535 goto err; 536 537create: if (txn != NULL && IS_REP_CLIENT(env) && 538 !F_ISSET(dbp, DB_AM_NOT_DURABLE)) { 539 __db_errx(env, 540 "Transactional create on replication client disallowed"); 541 ret = EINVAL; 542 goto err; 543 } 544 545 if (F_ISSET(dbp, DB_AM_INMEM)) 546 ret = __fop_inmem_create(dbp, name, txn, flags); 547 else { 548 if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0) 549 goto err; 550 if (TXN_ON(env) && txn != NULL && 551 (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0) 552 goto err; 553 if ((ret = __fop_create(env, 554 stxn, &fhp, tmpname, DB_APP_DATA, mode, dflags)) != 0) { 555 /* 556 * If no transactions, there is a race on creating the 557 * backup file, as the backup file name is the same for 558 * all processes. Wait for the other process to finish 559 * with the name. 560 */ 561 if (!TXN_ON(env) && ret == EEXIST) { 562 __os_free(env, tmpname); 563 tmpname = NULL; 564 __os_yield(env, 1, 0); 565 goto retry; 566 } 567 goto err; 568 } 569 tmp_created = 1; 570 } 571 572creat2: if (!F_ISSET(dbp, DB_AM_INMEM)) { 573 if ((ret = __db_appname(env, 574 DB_APP_DATA, tmpname, 0, NULL, &real_tmpname)) != 0) 575 goto err; 576 577 /* Set the pagesize if it isn't yet set. */ 578 if (dbp->pgsize == 0 && 579 (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0) 580 goto errmsg; 581 582 /* Construct a file_id. */ 583 if ((ret = 584 __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0) 585 goto errmsg; 586 } 587 588 if ((ret = __db_new_file(dbp, ip, 589 F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0) 590 goto err; 591 592 /* 593 * We need to close the handle here on platforms where remove and 594 * rename fail if a handle is open (including Windows). 595 */ 596 CLOSE_HANDLE(dbp, fhp); 597 598 /* 599 * Now move the file into place unless we are creating in place (because 600 * we created a database in a file that started out 0-length). If 601 * this is an in-memory file, we may or may not hold the environment 602 * lock depending on how we got here. 603 */ 604 if (!F_ISSET(dbp, DB_AM_COMPENSATE) && 605 !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock)) 606 GET_ENVLOCK(env, locker, &elock); 607 608 if (F_ISSET(dbp, DB_AM_IN_RENAME)) { 609 F_CLR(dbp, DB_AM_IN_RENAME); 610 __txn_remrem(env, txn, real_name); 611 } else if (name == tmpname) { 612 /* We created it in place. */ 613 } else if (!F_ISSET(dbp, DB_AM_INMEM) && 614 __os_exists(env, real_name, NULL) == 0) { 615 /* 616 * Someone managed to create the file; remove our temp 617 * and try to open the file that now exists. 618 */ 619 (void)__fop_remove(env, 620 NULL, dbp->fileid, tmpname, DB_APP_DATA, dflags); 621 (void)__ENV_LPUT(env, dbp->handle_lock); 622 LOCK_INIT(dbp->handle_lock); 623 624 if (stxn != NULL) { 625 ret = __txn_abort(stxn); 626 stxn = NULL; 627 } 628 if (ret != 0) 629 goto err; 630 goto reopen; 631 } 632 633 if (name != NULL && (ret = __fop_lock_handle(env, 634 dbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0) 635 goto err; 636 if (tmpname != NULL && tmpname != name && (ret = __fop_rename(env, 637 stxn, tmpname, name, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0) 638 goto err; 639 640 if (stxn != NULL) { 641 *retidp = stxn->txnid; 642 ret = __txn_commit(stxn, 0); 643 stxn = NULL; 644 } else 645 *retidp = TXN_INVALID; 646 647 if (ret != 0) 648 goto err; 649 650 F_SET(dbp, DB_AM_CREATED); 651 652 if (0) { 653errmsg: __db_err(env, ret, "%s", name); 654 655err: CLOSE_HANDLE(dbp, fhp); 656 if (stxn != NULL) 657 (void)__txn_abort(stxn); 658 if (tmp_created && txn == NULL) 659 (void)__fop_remove(env, 660 NULL, NULL, tmpname, DB_APP_DATA, dflags); 661 if (txn == NULL) 662 (void)__ENV_LPUT(env, dbp->handle_lock); 663 (void)__ENV_LPUT(env, elock); 664 if (created_locker) { 665 (void)__lock_id_free(env, dbp->locker); 666 dbp->locker = NULL; 667 } 668 } 669 670done: /* 671 * There are cases where real_name and tmpname take on the 672 * exact same string, so we need to make sure that we do not 673 * free twice. 674 */ 675 if (!truncating && tmpname != NULL && tmpname != name) 676 __os_free(env, tmpname); 677 if (real_name != name && real_name != NULL) 678 __os_free(env, real_name); 679 if (real_tmpname != NULL) 680 __os_free(env, real_tmpname); 681 CLOSE_HANDLE(dbp, fhp); 682 683 return (ret); 684} 685 686/* 687 * __fop_set_pgsize -- 688 * Set the page size based on file information. 689 */ 690static int 691__fop_set_pgsize(dbp, fhp, name) 692 DB *dbp; 693 DB_FH *fhp; 694 const char *name; 695{ 696 ENV *env; 697 u_int32_t iopsize; 698 int ret; 699 700 env = dbp->env; 701 702 /* 703 * Use the filesystem's optimum I/O size as the pagesize if a pagesize 704 * not specified. Some filesystems have 64K as their optimum I/O size, 705 * but as that results in fairly large default caches, we limit the 706 * default pagesize to 16K. 707 */ 708 if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) { 709 __db_err(env, ret, "%s", name); 710 return (ret); 711 } 712 if (iopsize < 512) 713 iopsize = 512; 714 if (iopsize > 16 * 1024) 715 iopsize = 16 * 1024; 716 717 /* 718 * Sheer paranoia, but we don't want anything that's not a power-of-2 719 * (we rely on that for alignment of various types on the pages), and 720 * we want a multiple of the sector size as well. If the value 721 * we got out of __os_ioinfo looks bad, use a default instead. 722 */ 723 if (!IS_VALID_PAGESIZE(iopsize)) 724 iopsize = DB_DEF_IOSIZE; 725 726 dbp->pgsize = iopsize; 727 F_SET(dbp, DB_AM_PGDEF); 728 729 return (0); 730} 731 732/* 733 * __fop_subdb_setup -- 734 * 735 * Subdb setup is significantly simpler than file setup. In terms of 736 * locking, for the duration of the operation/transaction, the locks on 737 * the meta-data page will suffice to protect us from simultaneous operations 738 * on the sub-database. Before we complete the operation though, we'll get a 739 * handle lock on the subdatabase so that on one else can try to remove it 740 * while we've got it open. We use an object that looks like the meta-data 741 * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle. 742 * locks. 743 * 744 * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *, 745 * PUBLIC: const char *, const char *, int, u_int32_t)); 746 */ 747int 748__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags) 749 DB *dbp; 750 DB_THREAD_INFO *ip; 751 DB_TXN *txn; 752 const char *mname, *name; 753 int mode; 754 u_int32_t flags; 755{ 756 DB *mdbp; 757 ENV *env; 758 db_lockmode_t lkmode; 759 int ret, t_ret; 760 761 mdbp = NULL; 762 env = dbp->env; 763 764 if ((ret = __db_master_open(dbp, 765 ip, txn, mname, flags, mode, &mdbp)) != 0) 766 return (ret); 767 /* 768 * If we created this file, then we need to set the DISCARD flag so 769 * that if we fail in the middle of this routine, we discard from the 770 * mpool any pages that we just created. 771 */ 772 if (F_ISSET(mdbp, DB_AM_CREATED)) 773 F_SET(mdbp, DB_AM_DISCARD); 774 775 /* 776 * We are going to close this instance of the master, so we can 777 * steal its handle instead of reopening a handle on the database. 778 */ 779 if (LF_ISSET(DB_FCNTL_LOCKING)) { 780 dbp->saved_open_fhp = mdbp->saved_open_fhp; 781 mdbp->saved_open_fhp = NULL; 782 } 783 784 /* Copy the pagesize and set the sub-database flag. */ 785 dbp->pgsize = mdbp->pgsize; 786 F_SET(dbp, DB_AM_SUBDB); 787 788 if (name != NULL && (ret = __db_master_update(mdbp, dbp, 789 ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) 790 goto err; 791 792 /* 793 * Hijack the master's locker ID as well, so that our locks don't 794 * conflict with the master's. Since we're closing the master, 795 * that locker would just have been freed anyway. Once we've gotten 796 * the locker id, we need to acquire the handle lock for this 797 * subdatabase. 798 */ 799 dbp->locker = mdbp->locker; 800 mdbp->locker = NULL; 801 802 DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname); 803 804 /* 805 * We copy our fileid from our master so that we all open 806 * the same file in mpool. We'll use the meta-pgno to lock 807 * so that we end up with different handle locks. 808 */ 809 810 memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN); 811 lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ? 812 DB_LOCK_WRITE : DB_LOCK_READ; 813 if ((ret = __fop_lock_handle(env, dbp, 814 txn == NULL ? dbp->locker : txn->locker, lkmode, NULL, 815 NOWAIT_FLAG(txn))) != 0) 816 goto err; 817 818 if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) { 819 /* 820 * If there was no transaction and we created this database, 821 * then we need to undo the update of the master database. 822 */ 823 if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL) 824 (void)__db_master_update(mdbp, dbp, 825 ip, txn, name, dbp->type, MU_REMOVE, NULL, 0); 826 F_CLR(dbp, DB_AM_CREATED); 827 goto err; 828 } 829 830 /* 831 * XXX 832 * This should have been done at the top of this routine. The problem 833 * is that __db_init_subdb() uses "standard" routines to process the 834 * meta-data page and set information in the DB handle based on it. 835 * Those routines have to deal with swapped pages and will normally set 836 * the DB_AM_SWAP flag. However, we use the master's metadata page and 837 * that has already been swapped, so they get the is-swapped test wrong. 838 */ 839 F_CLR(dbp, DB_AM_SWAP); 840 F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP)); 841 842 /* 843 * In the file create case, these happen in separate places so we have 844 * two different tests. They end up in the same place for subdbs, but 845 * for compatibility with file testing, we put them both here anyway. 846 */ 847 DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname); 848 DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname); 849 850 /* 851 * File exists and we have the appropriate locks; we should now 852 * process a normal open. 853 */ 854 if (F_ISSET(mdbp, DB_AM_CREATED)) { 855 F_SET(dbp, DB_AM_CREATED_MSTR); 856 F_CLR(mdbp, DB_AM_DISCARD); 857 } 858 859 if (0) { 860err: 861DB_TEST_RECOVERY_LABEL 862 if (txn == NULL) 863 (void)__ENV_LPUT(env, dbp->handle_lock); 864 } 865 866 /* 867 * The master's handle lock is under the control of the 868 * subdb (it acquired the master's locker). We want to 869 * keep the master's handle lock so that no one can remove 870 * the file while the subdb is open. If we register the 871 * trade event and then invalidate the copy of the lock 872 * in the master's handle, that will accomplish this. However, 873 * before we register this event, we'd better remove any 874 * events that we've already registered for the master. 875 */ 876 if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) { 877 /* Unregister old master events. */ 878 __txn_remlock(env, 879 txn, &mdbp->handle_lock, DB_LOCK_INVALIDID); 880 881 /* Now register the new event. */ 882 if ((t_ret = __txn_lockevent(env, txn, dbp, 883 &mdbp->handle_lock, dbp->locker == NULL ? 884 mdbp->locker : dbp->locker)) != 0 && ret == 0) 885 ret = t_ret; 886 } 887 LOCK_INIT(mdbp->handle_lock); 888 889 /* 890 * If the master was created, we need to sync so that the metadata 891 * page is correct on disk for recovery, since it isn't read through 892 * mpool. If we're opening a subdb in an existing file, we can skip 893 * the sync. 894 */ 895 if (txn == NULL || F_ISSET(txn, TXN_CDSGROUP) || 896 F_ISSET(mdbp, DB_AM_RECOVER)) { 897 if ((t_ret = __db_close(mdbp, txn, 898 F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && 899 ret == 0) 900 ret = t_ret; 901 } else { 902 if (F_ISSET(dbp, DB_AM_CREATED_MSTR) && 903 (t_ret = __memp_fsync(mdbp->mpf)) != 0 && ret == 0) 904 ret = t_ret; 905 906 if ((t_ret = 907 __txn_closeevent(env, txn, mdbp)) != 0 && ret == 0) 908 ret = t_ret; 909 } 910 911 return (ret); 912} 913 914/* 915 * __fop_remove_setup -- 916 * Open handle appropriately and lock for removal of a database file. 917 * 918 * PUBLIC: int __fop_remove_setup __P((DB *, 919 * PUBLIC: DB_TXN *, const char *, u_int32_t)); 920 */ 921int 922__fop_remove_setup(dbp, txn, name, flags) 923 DB *dbp; 924 DB_TXN *txn; 925 const char *name; 926 u_int32_t flags; 927{ 928 DB_FH *fhp; 929 DB_LOCK elock; 930 ENV *env; 931 u_int8_t mbuf[DBMETASIZE]; 932 int ret; 933 934 COMPQUIET(flags, 0); 935 936 env = dbp->env; 937 938 LOCK_INIT(elock); 939 fhp = NULL; 940 ret = 0; 941 942 /* Create locker if necessary. */ 943retry: if (LOCKING_ON(env)) { 944 if (txn != NULL) 945 dbp->locker = txn->locker; 946 else if (dbp->locker == DB_LOCK_INVALIDID) { 947 if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0) 948 goto err; 949 } 950 } 951 952 /* 953 * We are about to open a file handle and then possibly close it. 954 * We cannot close handles if we are doing FCNTL locking. However, 955 * there is no way to pass the FCNTL flag into this routine via the 956 * user API. The only way we can get in here and be doing FCNTL 957 * locking is if we are trying to clean up an open that was called 958 * with FCNTL locking. In that case, the save_fhp should already be 959 * set. So, we use that field to tell us if we need to make sure 960 * that we shouldn't close the handle. 961 */ 962 fhp = dbp->saved_open_fhp; 963 DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL); 964 965 /* 966 * Lock environment to protect file open. That will enable us to 967 * read the meta-data page and get the fileid so that we can lock 968 * the handle. 969 */ 970 GET_ENVLOCK(env, dbp->locker, &elock); 971 972 /* Open database. */ 973 if (F_ISSET(dbp, DB_AM_INMEM)) { 974 if ((ret = __env_mpool(dbp, name, flags)) == 0) 975 ret = __os_strdup(env, name, &dbp->dname); 976 } else if (fhp == NULL) 977 ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp); 978 if (ret != 0) 979 goto err; 980 981 /* Get meta-data */ 982 if (F_ISSET(dbp, DB_AM_INMEM)) 983 ret = __fop_inmem_read_meta(dbp, txn, name, flags); 984 else if ((ret = __fop_read_meta(env, 985 name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0) 986 ret = __db_meta_setup(env, dbp, 987 name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN); 988 if (ret != 0) 989 goto err; 990 991 /* 992 * Now, get the handle lock. We first try with NOWAIT, because if 993 * we have to wait, we're going to have to close the file and reopen 994 * it, so that if there is someone else removing it, our open doesn't 995 * prevent that. 996 */ 997 if ((ret = __fop_lock_handle(env, 998 dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) { 999 /* 1000 * Close the file, block on the lock, clean up the dbp, and 1001 * then start all over again. 1002 */ 1003 if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) { 1004 (void)__os_closehandle(env, fhp); 1005 fhp = NULL; 1006 } 1007 if (ret != DB_LOCK_NOTGRANTED || 1008 (txn != NULL && F_ISSET(txn, TXN_NOWAIT))) 1009 goto err; 1010 else if ((ret = __fop_lock_handle(env, 1011 dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0) 1012 goto err; 1013 1014 if (F_ISSET(dbp, DB_AM_INMEM)) { 1015 (void)__lock_put(env, &dbp->handle_lock); 1016 (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1); 1017 } else { 1018 if (txn != NULL) 1019 dbp->locker = NULL; 1020 (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0); 1021 } 1022 goto retry; 1023 } else if ((ret = __ENV_LPUT(env, elock)) != 0) 1024 goto err; 1025 else if (F_ISSET(dbp, DB_AM_IN_RENAME)) 1026 ret = ENOENT; 1027 1028 if (0) { 1029err: (void)__ENV_LPUT(env, elock); 1030 } 1031 if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING)) 1032 (void)__os_closehandle(env, fhp); 1033 /* 1034 * If this is a real file and we are going to proceed with the removal, 1035 * then we need to make sure that we don't leave any pages around in the 1036 * mpool since the file is closed and will be reopened again before 1037 * access. However, this might be an in-memory file, in which case 1038 * we will handle the discard from the mpool later as it's the "real" 1039 * removal of the database. 1040 */ 1041 if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM)) 1042 F_SET(dbp, DB_AM_DISCARD); 1043 return (ret); 1044} 1045 1046/* 1047 * __fop_read_meta -- 1048 * Read the meta-data page from a file and return it in buf. 1049 * 1050 * PUBLIC: int __fop_read_meta __P((ENV *, const char *, 1051 * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *)); 1052 */ 1053int 1054__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp) 1055 ENV *env; 1056 const char *name; 1057 u_int8_t *buf; 1058 size_t size; 1059 DB_FH *fhp; 1060 int errok; 1061 size_t *nbytesp; 1062{ 1063 size_t nr; 1064 int ret; 1065 1066 /* 1067 * Our caller wants to know the number of bytes read, even if we 1068 * return an error. 1069 */ 1070 if (nbytesp != NULL) 1071 *nbytesp = 0; 1072 1073 nr = 0; 1074 ret = __os_read(env, fhp, buf, size, &nr); 1075 if (nbytesp != NULL) 1076 *nbytesp = nr; 1077 1078 if (ret != 0) { 1079 if (!errok) 1080 __db_err(env, ret, "%s", name); 1081 goto err; 1082 } 1083 1084 if (nr != size) { 1085 if (!errok) 1086 __db_errx(env, 1087 "%s: unexpected file type or format", name); 1088 ret = EINVAL; 1089 } 1090 1091err: 1092 return (ret); 1093} 1094 1095/* 1096 * __fop_dummy -- 1097 * This implements the creation and name swapping of dummy files that 1098 * we use for remove and rename (remove is simply a rename with a delayed 1099 * remove). 1100 * 1101 * PUBLIC: int __fop_dummy __P((DB *, 1102 * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t)); 1103 */ 1104int 1105__fop_dummy(dbp, txn, old, new, flags) 1106 DB *dbp; 1107 DB_TXN *txn; 1108 const char *old, *new; 1109 u_int32_t flags; 1110{ 1111 DB *tmpdbp; 1112 DB_TXN *stxn; 1113 ENV *env; 1114 char *back; 1115 int ret, t_ret; 1116 u_int8_t mbuf[DBMETASIZE]; 1117 1118 env = dbp->env; 1119 back = NULL; 1120 stxn = NULL; 1121 tmpdbp = NULL; 1122 1123 DB_ASSERT(env, txn != NULL); 1124 1125 /* 1126 * Begin sub transaction to encapsulate the rename. Note that we 1127 * expect the inmem_swap calls to complete the sub-transaction, 1128 * aborting on error and committing on success. 1129 */ 1130 if (TXN_ON(env) && 1131 (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0) 1132 goto err; 1133 1134 /* We need to create a dummy file as a place holder. */ 1135 if ((ret = __db_backup_name(env, new, stxn, &back)) != 0) 1136 goto err; 1137 /* Create a dummy dbp handle. */ 1138 if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) 1139 goto err; 1140 1141 memset(mbuf, 0, sizeof(mbuf)); 1142 ret = F_ISSET(dbp, DB_AM_INMEM) ? 1143 __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) : 1144 __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf, flags); 1145 1146 if (ret != 0) 1147 goto err; 1148 1149 ret = F_ISSET(dbp, DB_AM_INMEM) ? 1150 __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) : 1151 __fop_ondisk_swap(dbp, 1152 tmpdbp, stxn, old, new, back, txn->locker, flags); 1153 stxn = NULL; 1154 if (ret != 0) 1155 goto err; 1156 1157err: if (stxn != NULL) 1158 (void)__txn_abort(stxn); 1159 if (tmpdbp != NULL && 1160 (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0) 1161 ret = t_ret; 1162 if (back != NULL) 1163 __os_free(env, back); 1164 return (ret); 1165} 1166 1167/* 1168 * __fop_dbrename -- 1169 * Do the appropriate file locking and file system operations 1170 * to effect a dbrename in the absence of transactions (__fop_dummy 1171 * and the subsequent calls in __db_rename do the work for the 1172 * transactional case). 1173 * 1174 * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *)); 1175 */ 1176int 1177__fop_dbrename(dbp, old, new) 1178 DB *dbp; 1179 const char *old, *new; 1180{ 1181 DB_LOCK elock; 1182 ENV *env; 1183 char *real_new, *real_old; 1184 int ret, t_ret; 1185 1186 env = dbp->env; 1187 real_new = NULL; 1188 real_old = NULL; 1189 LOCK_INIT(elock); 1190 1191 if (F_ISSET(dbp, DB_AM_INMEM)) { 1192 real_new = (char *)new; 1193 real_old = (char *)old; 1194 } else { 1195 /* Get full names. */ 1196 if ((ret = __db_appname(env, 1197 DB_APP_DATA, new, 0, NULL, &real_new)) != 0) 1198 goto err; 1199 1200 if ((ret = __db_appname(env, 1201 DB_APP_DATA, old, 0, NULL, &real_old)) != 0) 1202 goto err; 1203 1204 } 1205 1206 /* 1207 * It is an error to rename a file over one that already exists, 1208 * as that wouldn't be transaction-safe. We check explicitly 1209 * for ondisk files, but it's done memp_nameop for in-memory ones. 1210 */ 1211 GET_ENVLOCK(env, dbp->locker, &elock); 1212 ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT : 1213 __os_exists(env, real_new, NULL); 1214 1215 if (ret == 0) { 1216 ret = EEXIST; 1217 __db_errx(env, "rename: file %s exists", real_new); 1218 goto err; 1219 } 1220 1221 ret = __memp_nameop(env, 1222 dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM)); 1223 1224err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0) 1225 ret = t_ret; 1226 if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL) 1227 __os_free(env, real_old); 1228 if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL) 1229 __os_free(env, real_new); 1230 return (ret); 1231} 1232 1233static int 1234__fop_inmem_create(dbp, name, txn, flags) 1235 DB *dbp; 1236 const char *name; 1237 DB_TXN *txn; 1238 u_int32_t flags; 1239{ 1240 DBT fid_dbt, name_dbt; 1241 DB_LSN lsn; 1242 ENV *env; 1243 int ret; 1244 int32_t lfid; 1245 u_int32_t *p32; 1246 1247 env = dbp->env; 1248 1249 MAKE_INMEM(dbp); 1250 1251 /* Set the pagesize if it isn't yet set. */ 1252 if (dbp->pgsize == 0) 1253 dbp->pgsize = DB_DEF_IOSIZE; 1254 1255 /* 1256 * Construct a file_id. 1257 * 1258 * If this file has no name, then we only need a fileid for locking. 1259 * If this file has a name, we need the fileid both for locking and 1260 * matching in the memory pool. So, with unnamed in-memory databases, 1261 * use a lock_id. For named in-memory files, we need to find a value 1262 * that we can use to uniquely identify a name/fid pair. We use a 1263 * combination of a unique id (__os_unique_id) and a hash of the 1264 * original name. 1265 */ 1266 if (name == NULL) { 1267 if (LOCKING_ON(env) && (ret = 1268 __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0) 1269 goto err; 1270 } else { 1271 p32 = (u_int32_t *)(&dbp->fileid[0]); 1272 __os_unique_id(env, p32); 1273 p32++; 1274 (void)strncpy( 1275 (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t)); 1276 dbp->preserve_fid = 1; 1277 1278 if (DBENV_LOGGING(env) && 1279#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC) 1280 txn != NULL && 1281#endif 1282 dbp->log_filename != NULL) 1283 memcpy(dbp->log_filename->ufid, 1284 dbp->fileid, DB_FILE_ID_LEN); 1285 } 1286 1287 /* Now, set the fileid. */ 1288 if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0) 1289 goto err; 1290 1291 if ((ret = __env_mpool(dbp, name, flags)) != 0) 1292 goto err; 1293 1294 if (DBENV_LOGGING(env) && 1295#if !defined(DEBUG_WOP) 1296 txn != NULL && 1297#endif 1298 name != NULL) { 1299 DB_INIT_DBT(name_dbt, name, strlen(name) + 1); 1300 memset(&fid_dbt, 0, sizeof(fid_dbt)); 1301 fid_dbt.data = dbp->fileid; 1302 fid_dbt.size = DB_FILE_ID_LEN; 1303 lfid = dbp->log_filename == NULL ? 1304 DB_LOGFILEID_INVALID : dbp->log_filename->id; 1305 if ((ret = __crdel_inmem_create_log(env, txn, 1306 &lsn, 0, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0) 1307 goto err; 1308 } 1309 1310 F_SET(dbp, DB_AM_CREATED); 1311 1312err: 1313 return (ret); 1314} 1315 1316static int 1317__fop_inmem_read_meta(dbp, txn, name, flags) 1318 DB *dbp; 1319 DB_TXN *txn; 1320 const char *name; 1321 u_int32_t flags; 1322{ 1323 DBMETA *metap; 1324 DB_THREAD_INFO *ip; 1325 db_pgno_t pgno; 1326 int ret, t_ret; 1327 1328 if (txn == NULL) 1329 ENV_GET_THREAD_INFO(dbp->env, ip); 1330 else 1331 ip = txn->thread_info; 1332 1333 pgno = PGNO_BASE_MD; 1334 if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0) 1335 return (ret); 1336 ret = __db_meta_setup(dbp->env, dbp, name, metap, flags, 1); 1337 1338 if ((t_ret = 1339 __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0) 1340 ret = t_ret; 1341 1342 return (ret); 1343} 1344 1345static int 1346__fop_ondisk_dummy(dbp, txn, name, mbuf, flags) 1347 DB *dbp; 1348 DB_TXN *txn; 1349 const char *name; 1350 u_int8_t *mbuf; 1351 u_int32_t flags; 1352{ 1353 ENV *env; 1354 int ret; 1355 char *realname; 1356 u_int32_t dflags; 1357 1358 realname = NULL; 1359 env = dbp->env; 1360 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; 1361 1362 if ((ret = __db_appname(env, 1363 DB_APP_DATA, name, flags, NULL, &realname)) != 0) 1364 goto err; 1365 1366 if ((ret = __fop_create(env, 1367 txn, NULL, name, DB_APP_DATA, 0, dflags)) != 0) 1368 goto err; 1369 1370 if ((ret = 1371 __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0) 1372 goto err; 1373 1374 ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC; 1375 if ((ret = __fop_write(env, txn, name, 1376 DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0) 1377 goto err; 1378 1379 memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN); 1380 1381err: if (realname != NULL) 1382 __os_free(env, realname); 1383 1384 return (ret); 1385} 1386 1387static int 1388__fop_inmem_dummy(dbp, txn, name, mbuf) 1389 DB *dbp; 1390 DB_TXN *txn; 1391 const char *name; 1392 u_int8_t *mbuf; 1393{ 1394 DBMETA *metap; 1395 DB_THREAD_INFO *ip; 1396 db_pgno_t pgno; 1397 int ret, t_ret; 1398 1399 if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0) 1400 return (ret); 1401 if (txn == NULL) 1402 ENV_GET_THREAD_INFO(dbp->env, ip); 1403 else 1404 ip = txn->thread_info; 1405 1406 pgno = PGNO_BASE_MD; 1407 if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 1408 DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0) 1409 return (ret); 1410 /* Check file existed. */ 1411 if (metap->magic != 0) 1412 ret = EEXIST; 1413 else 1414 metap->magic = DB_RENAMEMAGIC; 1415 1416 /* Copy the fileid onto the meta-data page. */ 1417 memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN); 1418 1419 if ((t_ret = __memp_fput(dbp->mpf, ip, metap, 1420 ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0) 1421 ret = t_ret; 1422 1423 if (ret != 0) 1424 goto err; 1425 1426 ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC; 1427 1428err: return (ret); 1429} 1430 1431static int 1432__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker, flags) 1433 DB *dbp, *tmpdbp; 1434 DB_TXN *txn; 1435 const char *old, *new, *back; 1436 DB_LOCKER *locker; 1437 u_int32_t flags; 1438{ 1439 DBT fiddbt, namedbt, tmpdbt; 1440 DB_FH *fhp; 1441 DB_LOCK elock; 1442 DB_LSN lsn; 1443 DB_TXN *parent; 1444 ENV *env; 1445 u_int8_t mbuf[DBMETASIZE]; 1446 u_int32_t child_txnid, dflags; 1447 int ret, t_ret; 1448 char *realold, *realnew; 1449 1450 env = dbp->env; 1451 DB_ASSERT(env, txn != NULL); 1452 DB_ASSERT(env, old != NULL); 1453 1454 realold = realnew = NULL; 1455 LOCK_INIT(elock); 1456 fhp = NULL; 1457 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0; 1458 1459 if ((ret = 1460 __db_appname(env, DB_APP_DATA, new, 0, NULL, &realnew)) != 0) 1461 goto err; 1462 1463 /* Now, lock the name space while we initialize this file. */ 1464retry: GET_ENVLOCK(env, locker, &elock); 1465 if (__os_exists(env, realnew, NULL) == 0) { 1466 /* 1467 * It is possible that the only reason this file exists is 1468 * because we've done a previous rename of it and we have 1469 * left a placeholder here. We need to check for that case 1470 * and allow this rename to succeed if that's the case. 1471 */ 1472 if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0) 1473 goto err; 1474 if ((ret = __fop_read_meta(env, 1475 realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 || 1476 (ret = __db_meta_setup(env, 1477 tmpdbp, realnew, (DBMETA *)mbuf, 0, 1)) != 0) { 1478 ret = EEXIST; 1479 goto err; 1480 } 1481 ret = __os_closehandle(env, fhp); 1482 fhp = NULL; 1483 if (ret != 0) 1484 goto err; 1485 1486 /* 1487 * Now, try to acquire the handle lock. If the handle is locked 1488 * by our current, transaction, then we'll get it and life is 1489 * good. 1490 * 1491 * Alternately, it's not locked at all, we'll get the lock, but 1492 * we will realize it exists and consider this an error. 1493 * 1494 * However, if it's held by another transaction, then there 1495 * could be two different scenarios: 1) the file is in the 1496 * midst of being created or deleted and when that transaction 1497 * is over, we might be able to proceed. 2) the file is open 1498 * and exists and we should report an error. In order to 1499 * distinguish these two cases, we do the following. First, we 1500 * try to acquire a READLOCK. If the handle is in the midst of 1501 * being created, then we'll block because a writelock is held. 1502 * In that case, we should request a blocking write, and when we 1503 * get the lock, we should then go back and check to see if the 1504 * object exists and start all over again. 1505 * 1506 * If we got the READLOCK, then either no one is holding the 1507 * lock or someone has an open handle and the fact that the file 1508 * exists is problematic. So, in this case, we request the 1509 * WRITELOCK non-blocking -- if it succeeds, we're golden. If 1510 * it fails, then the file exists and we return EEXIST. 1511 */ 1512 if ((ret = __fop_lock_handle(env, 1513 tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) { 1514 /* 1515 * Someone holds a write-lock. Wait for the write-lock 1516 * and after we get it, release it and start over. 1517 */ 1518 if ((ret = __fop_lock_handle(env, tmpdbp, 1519 locker, DB_LOCK_WRITE, &elock, 0)) != 0) 1520 goto err; 1521 if ((ret = 1522 __lock_put(env, &tmpdbp->handle_lock)) != 0) 1523 goto err; 1524 if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0) 1525 goto err; 1526 goto retry; 1527 } 1528 1529 /* We got the read lock; try to upgrade it. */ 1530 ret = __fop_lock_handle(env, 1531 tmpdbp, locker, DB_LOCK_WRITE, 1532 NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT); 1533 if (ret != 0) { 1534 /* 1535 * We did not get the writelock, so someone 1536 * has the handle open. This is an error. 1537 */ 1538 (void)__lock_put(env, &tmpdbp->handle_lock); 1539 ret = EEXIST; 1540 } else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) 1541 /* We got the lock and are renaming it. */ 1542 ret = 0; 1543 else { /* We got the lock, but the file exists. */ 1544 (void)__lock_put(env, &tmpdbp->handle_lock); 1545 ret = EEXIST; 1546 } 1547 if (ret != 0) 1548 goto err; 1549 } 1550 1551 /* 1552 * While we have the namespace locked, do the renames and then 1553 * swap for the handle lock. 1554 */ 1555 if ((ret = __fop_rename(env, 1556 txn, old, new, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0) 1557 goto err; 1558 if ((ret = __fop_rename(env, 1559 txn, back, old, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0) 1560 goto err; 1561 if ((ret = __fop_lock_handle(env, 1562 tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0) 1563 goto err; 1564 1565 /* 1566 * We just acquired a transactional lock on the tmp handle. 1567 * We need to null out the tmp handle's lock so that it 1568 * doesn't create problems for us in the close path. 1569 */ 1570 LOCK_INIT(tmpdbp->handle_lock); 1571 1572 /* Commit the child. */ 1573 child_txnid = txn->txnid; 1574 parent = txn->parent; 1575 ret = __txn_commit(txn, 0); 1576 txn = NULL; 1577 1578 /* Now log the child information in the parent. */ 1579 memset(&fiddbt, 0, sizeof(fiddbt)); 1580 fiddbt.data = dbp->fileid; 1581 fiddbt.size = DB_FILE_ID_LEN; 1582 memset(&tmpdbt, 0, sizeof(fiddbt)); 1583 tmpdbt.data = tmpdbp->fileid; 1584 tmpdbt.size = DB_FILE_ID_LEN; 1585 DB_INIT_DBT(namedbt, old, strlen(old) + 1); 1586 if ((t_ret = __fop_file_remove_log(env, 1587 parent, &lsn, 0, &fiddbt, &tmpdbt, &namedbt, 1588 (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0) 1589 ret = t_ret; 1590 1591 /* This is a delayed delete of the dummy file. */ 1592 if ((ret = __db_appname(env, 1593 DB_APP_DATA, old, flags, NULL, &realold)) != 0) 1594 goto err; 1595 1596 if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0) 1597 goto err; 1598 1599err: if (txn != NULL) /* Ret must already be set, so void abort. */ 1600 (void)__txn_abort(txn); 1601 1602 (void)__ENV_LPUT(env, elock); 1603 1604 if (fhp != NULL && 1605 (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) 1606 ret = t_ret; 1607 1608 if (realnew != NULL) 1609 __os_free(env, realnew); 1610 if (realold != NULL) 1611 __os_free(env, realold); 1612 return (ret); 1613} 1614 1615static int 1616__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker) 1617 DB *olddbp, *backdbp; 1618 DB_TXN *txn; 1619 const char *old, *new, *back; 1620 DB_LOCKER *locker; 1621{ 1622 DB *tmpdbp; 1623 DBT fid_dbt, n1_dbt, n2_dbt; 1624 DB_LOCK elock; 1625 DB_LSN lsn; 1626 DB_TXN *parent; 1627 ENV *env; 1628 int ret, t_ret; 1629 1630 env = olddbp->env; 1631 parent = txn->parent; 1632retry: LOCK_INIT(elock); 1633 if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) 1634 return (ret); 1635 MAKE_INMEM(tmpdbp); 1636 1637 GET_ENVLOCK(env, locker, &elock); 1638 if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) { 1639 /* 1640 * It is possible that the only reason this database exists is 1641 * because we've done a previous rename of it and we have 1642 * left a placeholder here. We need to check for that case 1643 * and allow this rename to succeed if that's the case. 1644 */ 1645 1646 if ((ret = __fop_inmem_read_meta(tmpdbp, txn, new, 0)) != 0) { 1647 ret = EEXIST; 1648 goto err; 1649 } 1650 1651 /* 1652 * Now, try to acquire the handle lock. If it's from our txn, 1653 * then we'll get the lock. If it's not, then someone else has 1654 * it locked. See the comments in __fop_ondisk_swap for 1655 * details. 1656 */ 1657 if ((ret = __fop_lock_handle(env, 1658 tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) { 1659 /* 1660 * Someone holds a writelock. Try for the WRITELOCK 1661 * and after we get it, retry. 1662 */ 1663 if ((ret = __fop_lock_handle(env, tmpdbp, 1664 locker, DB_LOCK_WRITE, &elock, 0)) != 0) 1665 goto err; 1666 1667 /* We have the write lock; release it and start over. */ 1668 (void)__lock_put(env, &tmpdbp->handle_lock); 1669 (void)__db_close(tmpdbp, NULL, DB_NOSYNC); 1670 (void)__ENV_LPUT(env, elock); 1671 goto retry; 1672 } else { 1673 (void)__lock_put(env, &tmpdbp->handle_lock); 1674 if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME)) 1675 ret = EEXIST; 1676 } 1677 if (ret != 0) 1678 goto err; 1679 } 1680 1681 /* Log the renames. */ 1682 if (LOGGING_ON(env) 1683#ifndef DEBUG_WOP 1684 && txn != NULL 1685#endif 1686 ) { 1687 /* Rename old to new. */ 1688 DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN); 1689 DB_INIT_DBT(n1_dbt, old, strlen(old) + 1); 1690 DB_INIT_DBT(n2_dbt, new, strlen(new) + 1); 1691 if ((ret = __crdel_inmem_rename_log( 1692 env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0) 1693 goto err; 1694 1695 /* Rename back to old */ 1696 fid_dbt.data = backdbp->fileid; 1697 DB_SET_DBT(n2_dbt, back, strlen(back) + 1); 1698 if ((ret = __crdel_inmem_rename_log( 1699 env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0) 1700 goto err; 1701 } 1702 1703 /* 1704 * While we have the namespace locked, do the renames and then 1705 * swap for the handle lock. If we ran into a file in the midst 1706 * of rename, then we need to delete it first, else nameop is 1707 * going to consider it an error. 1708 */ 1709 if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) { 1710 if ((ret = __memp_nameop(env, 1711 tmpdbp->fileid, NULL, new, NULL, 1)) != 0) 1712 goto err; 1713 __txn_remrem(env, parent, new); 1714 } 1715 1716 if ((ret = __memp_nameop( 1717 env, olddbp->fileid, new, old, new, 1)) != 0) 1718 goto err; 1719 if ((ret = __memp_nameop( 1720 env, backdbp->fileid, old, back, old, 1)) != 0) 1721 goto err; 1722 1723 if ((ret = __fop_lock_handle(env, 1724 tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0) 1725 goto err; 1726 1727 /* 1728 * We just acquired a transactional lock on the tmp handle. 1729 * We need to null out the tmp handle's lock so that it 1730 * doesn't create problems for us in the close path. 1731 */ 1732 LOCK_INIT(tmpdbp->handle_lock); 1733 1734 DB_ASSERT(env, txn != NULL); 1735 1736 /* Commit the child. */ 1737 ret = __txn_commit(txn, 0); 1738 txn = NULL; 1739 1740 if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0) 1741 goto err; 1742 1743err: (void)__ENV_LPUT(env, elock); 1744 1745 if (txn != NULL) 1746 (void)__txn_abort(txn); 1747 1748 if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0) 1749 ret = t_ret; 1750 1751 return (ret); 1752} 1753