/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	Margo Seltzer.  All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	The President and Fellows of Harvard University.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
40 * 41 * $Id$ 42 */ 43 44#include "db_config.h" 45 46#include "db_int.h" 47#include "dbinc/db_page.h" 48#include "dbinc/btree.h" 49#include "dbinc/hash.h" 50#include "dbinc/log.h" 51#include "dbinc/mp.h" 52 53static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *)); 54static int __ham_alloc_pages_42 55 __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *)); 56 57/* 58 * __ham_insdel_recover -- 59 * 60 * PUBLIC: int __ham_insdel_recover 61 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 62 */ 63int 64__ham_insdel_recover(env, dbtp, lsnp, op, info) 65 ENV *env; 66 DBT *dbtp; 67 DB_LSN *lsnp; 68 db_recops op; 69 void *info; 70{ 71 __ham_insdel_args *argp; 72 DB_THREAD_INFO *ip; 73 DB *file_dbp; 74 DBC *dbc; 75 DB_MPOOLFILE *mpf; 76 PAGE *pagep; 77 db_indx_t dindx; 78 u_int32_t opcode; 79 int cmp_n, cmp_p, dtype, ktype, ret; 80 81 ip = ((DB_TXNHEAD *)info)->thread_info; 82 pagep = NULL; 83 REC_PRINT(__ham_insdel_print); 84 REC_INTRO(__ham_insdel_read, ip, 1); 85 86 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 87 0, &pagep)) != 0) { 88 if (DB_UNDO(op)) { 89 if (ret == DB_PAGE_NOTFOUND) 90 goto done; 91 else { 92 ret = __db_pgerr(file_dbp, argp->pgno, ret); 93 goto out; 94 } 95 } 96 /* If the page is not here then it was later truncated. */ 97 if (!IS_ZERO_LSN(argp->pagelsn)) 98 goto done; 99 /* 100 * This page was created by a group allocation and 101 * the file may not have been extend yet. 102 * Create the page if necessary. 103 */ 104 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 105 DB_MPOOL_CREATE, &pagep)) != 0) { 106 ret = __db_pgerr(file_dbp, argp->pgno, ret); 107 goto out; 108 } 109 } 110 111 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 112 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 113 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 114 115 /* 116 * Two possible things going on: 117 * redo a delete/undo a put: delete the item from the page. 118 * redo a put/undo a delete: add the item to the page. 
119 * If we are undoing a delete, then the information logged is the 120 * entire entry off the page, not just the data of a dbt. In 121 * this case, we want to copy it back onto the page verbatim. 122 * We do this by calling __insertpair with the type H_OFFPAGE instead 123 * of H_KEYDATA. 124 */ 125 opcode = OPCODE_OF(argp->opcode); 126 if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) || 127 (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) { 128 /* 129 * Need to redo a PUT or undo a delete. 130 */ 131 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 132 ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ? 133 H_OFFPAGE : H_KEYDATA; 134 if (PAIR_ISDATADUP(argp->opcode)) 135 dtype = H_DUPLICATE; 136 else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode)) 137 dtype = H_OFFPAGE; 138 else 139 dtype = H_KEYDATA; 140 dindx = (db_indx_t)argp->ndx; 141 if ((ret = __ham_insertpair(dbc, pagep, &dindx, 142 &argp->key, &argp->data, ktype, dtype)) != 0) 143 goto out; 144 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 145 } else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) || 146 (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) { 147 /* Need to undo a put or redo a delete. */ 148 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 149 __ham_dpair(file_dbp, pagep, argp->ndx); 150 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 151 } 152 153 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 154 goto out; 155 pagep = NULL; 156 157 /* Return the previous LSN. */ 158done: *lsnp = argp->prev_lsn; 159 ret = 0; 160 161out: if (pagep != NULL) 162 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 163 REC_CLOSE; 164} 165 166/* 167 * __ham_newpage_recover -- 168 * This log message is used when we add/remove overflow pages. This 169 * message takes care of the pointer chains, not the data on the pages. 
170 * 171 * PUBLIC: int __ham_newpage_recover 172 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 173 */ 174int 175__ham_newpage_recover(env, dbtp, lsnp, op, info) 176 ENV *env; 177 DBT *dbtp; 178 DB_LSN *lsnp; 179 db_recops op; 180 void *info; 181{ 182 __ham_newpage_args *argp; 183 DB_THREAD_INFO *ip; 184 DB *file_dbp; 185 DBC *dbc; 186 DB_MPOOLFILE *mpf; 187 PAGE *pagep; 188 int change, cmp_n, cmp_p, ret; 189 190 ip = ((DB_TXNHEAD *)info)->thread_info; 191 pagep = NULL; 192 REC_PRINT(__ham_newpage_print); 193 REC_INTRO(__ham_newpage_read, ip, 0); 194 195 REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage); 196 change = 0; 197 198 /* 199 * There are potentially three pages we need to check: the one 200 * that we created/deleted, the one before it and the one after 201 * it. 202 */ 203 204 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 205 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 206 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 207 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 208 209 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 210 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 211 /* Redo a create new page or undo a delete new page. */ 212 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 213 P_INIT(pagep, file_dbp->pgsize, argp->new_pgno, 214 argp->prev_pgno, argp->next_pgno, 0, P_HASH); 215 change = 1; 216 } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) || 217 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 218 /* 219 * Redo a delete or undo a create new page. All we 220 * really need to do is change the LSN. 221 */ 222 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 223 change = 1; 224 } 225 226 if (change) 227 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 228 229 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 230 goto out; 231 pagep = NULL; 232 233 /* Now do the prev page. 
*/ 234ppage: if (argp->prev_pgno != PGNO_INVALID) { 235 REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage); 236 237 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 238 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn); 239 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn); 240 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 241 change = 0; 242 243 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 244 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 245 /* Redo a create new page or undo a delete new page. */ 246 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 247 pagep->next_pgno = argp->new_pgno; 248 change = 1; 249 } else if ((cmp_p == 0 && 250 DB_REDO(op) && argp->opcode == DELOVFL) || 251 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 252 /* Redo a delete or undo a create new page. */ 253 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 254 pagep->next_pgno = argp->next_pgno; 255 change = 1; 256 } 257 258 if (change) 259 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn; 260 261 if ((ret = __memp_fput(mpf, 262 ip, pagep, file_dbp->priority)) != 0) 263 goto out; 264 pagep = NULL; 265 } 266 267 /* Now time to do the next page */ 268npage: if (argp->next_pgno != PGNO_INVALID) { 269 REC_FGET(mpf, ip, argp->next_pgno, &pagep, done); 270 271 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 272 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn); 273 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn); 274 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 275 change = 0; 276 277 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 278 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 279 /* Redo a create new page or undo a delete new page. */ 280 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 281 pagep->prev_pgno = argp->new_pgno; 282 change = 1; 283 } else if ((cmp_p == 0 && 284 DB_REDO(op) && argp->opcode == DELOVFL) || 285 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 286 /* Redo a delete or undo a create new page. 
*/ 287 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 288 pagep->prev_pgno = argp->prev_pgno; 289 change = 1; 290 } 291 292 if (change) 293 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn; 294 295 if ((ret = __memp_fput(mpf, 296 ip, pagep, file_dbp->priority)) != 0) 297 goto out; 298 pagep = NULL; 299 } 300done: *lsnp = argp->prev_lsn; 301 ret = 0; 302 303out: if (pagep != NULL) 304 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 305 REC_CLOSE; 306} 307 308/* 309 * __ham_replace_recover -- 310 * This log message refers to partial puts that are local to a single 311 * page. You can think of them as special cases of the more general 312 * insdel log message. 313 * 314 * PUBLIC: int __ham_replace_recover 315 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 316 */ 317int 318__ham_replace_recover(env, dbtp, lsnp, op, info) 319 ENV *env; 320 DBT *dbtp; 321 DB_LSN *lsnp; 322 db_recops op; 323 void *info; 324{ 325 __ham_replace_args *argp; 326 DB_THREAD_INFO *ip; 327 DB *file_dbp; 328 DBC *dbc; 329 DB_MPOOLFILE *mpf; 330 DBT dbt; 331 PAGE *pagep; 332 u_int32_t change; 333 int cmp_n, cmp_p, is_plus, modified, ret; 334 u_int8_t *hk; 335 336 ip = ((DB_TXNHEAD *)info)->thread_info; 337 pagep = NULL; 338 REC_PRINT(__ham_replace_print); 339 REC_INTRO(__ham_replace_read, ip, 0); 340 341 REC_FGET(mpf, ip, argp->pgno, &pagep, done); 342 343 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 344 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 345 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 346 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 347 348 memset(&dbt, 0, sizeof(dbt)); 349 modified = 0; 350 351 /* 352 * Before we know the direction of the transformation we will 353 * determine the size differential; then once we know if we are 354 * redoing or undoing, we'll adjust the sign (is_plus) appropriately. 
355 */ 356 if (argp->newitem.size > argp->olditem.size) { 357 change = argp->newitem.size - argp->olditem.size; 358 is_plus = 1; 359 } else { 360 change = argp->olditem.size - argp->newitem.size; 361 is_plus = 0; 362 } 363 if (cmp_p == 0 && DB_REDO(op)) { 364 /* Reapply the change as specified. */ 365 dbt.data = argp->newitem.data; 366 dbt.size = argp->newitem.size; 367 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 368 LSN(pagep) = *lsnp; 369 /* 370 * The is_plus flag is set properly to reflect 371 * newitem.size - olditem.size. 372 */ 373 modified = 1; 374 } else if (cmp_n == 0 && DB_UNDO(op)) { 375 /* Undo the already applied change. */ 376 dbt.data = argp->olditem.data; 377 dbt.size = argp->olditem.size; 378 /* 379 * Invert is_plus to reflect sign of 380 * olditem.size - newitem.size. 381 */ 382 is_plus = !is_plus; 383 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 384 LSN(pagep) = argp->pagelsn; 385 modified = 1; 386 } 387 388 if (modified) { 389 __ham_onpage_replace(file_dbp, pagep, 390 argp->ndx, argp->off, change, is_plus, &dbt); 391 if (argp->makedup) { 392 hk = P_ENTRY(file_dbp, pagep, argp->ndx); 393 if (DB_REDO(op)) 394 HPAGE_PTYPE(hk) = H_DUPLICATE; 395 else 396 HPAGE_PTYPE(hk) = H_KEYDATA; 397 } 398 } 399 400 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 401 goto out; 402 pagep = NULL; 403 404done: *lsnp = argp->prev_lsn; 405 ret = 0; 406 407out: if (pagep != NULL) 408 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 409 REC_CLOSE; 410} 411 412/* 413 * __ham_splitdata_recover -- 414 * 415 * PUBLIC: int __ham_splitdata_recover 416 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 417 */ 418int 419__ham_splitdata_recover(env, dbtp, lsnp, op, info) 420 ENV *env; 421 DBT *dbtp; 422 DB_LSN *lsnp; 423 db_recops op; 424 void *info; 425{ 426 __ham_splitdata_args *argp; 427 DB_THREAD_INFO *ip; 428 DB *file_dbp; 429 DBC *dbc; 430 DB_MPOOLFILE *mpf; 431 PAGE *pagep; 432 int cmp_n, cmp_p, ret; 433 434 ip = ((DB_TXNHEAD 
*)info)->thread_info; 435 pagep = NULL; 436 REC_PRINT(__ham_splitdata_print); 437 REC_INTRO(__ham_splitdata_read, ip, 1); 438 439 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { 440 if (DB_UNDO(op)) { 441 if (ret == DB_PAGE_NOTFOUND) 442 goto done; 443 else { 444 ret = __db_pgerr(file_dbp, argp->pgno, ret); 445 goto out; 446 } 447 } 448 /* If the page is not here then it was later truncated. */ 449 if (!IS_ZERO_LSN(argp->pagelsn)) 450 goto done; 451 /* 452 * This page was created by a group allocation and 453 * the file may not have been extend yet. 454 * Create the page if necessary. 455 */ 456 if ((ret = __memp_fget(mpf, &argp->pgno, 457 ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) { 458 ret = __db_pgerr(file_dbp, argp->pgno, ret); 459 goto out; 460 } 461 } 462 463 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 464 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 465 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 466 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 467 468 /* 469 * There are three types of log messages here. Two are related 470 * to an actual page split operation, one for the old page 471 * and one for the new pages created. The original image in the 472 * SPLITOLD record is used for undo. The image in the SPLITNEW 473 * is used for redo. We should never have a case where there is 474 * a redo operation and the SPLITOLD record is on disk, but not 475 * the SPLITNEW record. Therefore, we only have work to do when 476 * redo NEW messages and undo OLD messages, but we have to update 477 * LSNs in both cases. 478 * 479 * The third message is generated when a page is sorted (SORTPAGE). In 480 * an undo the original image in the SORTPAGE is used. In a redo we 481 * recreate the sort operation by calling __ham_sort_page. 482 */ 483 if (cmp_p == 0 && DB_REDO(op)) { 484 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 485 if (argp->opcode == SPLITNEW) 486 /* Need to redo the split described. 
*/ 487 memcpy(pagep, argp->pageimage.data, 488 argp->pageimage.size); 489 else if (argp->opcode == SORTPAGE) { 490 if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0) 491 goto out; 492 } 493 LSN(pagep) = *lsnp; 494 } else if (cmp_n == 0 && DB_UNDO(op)) { 495 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 496 if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) { 497 /* Put back the old image. */ 498 memcpy(pagep, argp->pageimage.data, 499 argp->pageimage.size); 500 } else 501 P_INIT(pagep, file_dbp->pgsize, argp->pgno, 502 PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 503 LSN(pagep) = argp->pagelsn; 504 } 505 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 506 goto out; 507 pagep = NULL; 508 509done: *lsnp = argp->prev_lsn; 510 ret = 0; 511 512out: if (pagep != NULL) 513 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 514 REC_CLOSE; 515} 516 517/* 518 * __ham_copypage_recover -- 519 * Recovery function for copypage. 520 * 521 * PUBLIC: int __ham_copypage_recover 522 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 523 */ 524int 525__ham_copypage_recover(env, dbtp, lsnp, op, info) 526 ENV *env; 527 DBT *dbtp; 528 DB_LSN *lsnp; 529 db_recops op; 530 void *info; 531{ 532 __ham_copypage_args *argp; 533 DB_THREAD_INFO *ip; 534 DB *file_dbp; 535 DBC *dbc; 536 DB_MPOOLFILE *mpf; 537 PAGE *pagep; 538 int cmp_n, cmp_p, ret; 539 540 ip = ((DB_TXNHEAD *)info)->thread_info; 541 pagep = NULL; 542 REC_PRINT(__ham_copypage_print); 543 REC_INTRO(__ham_copypage_read, ip, 0); 544 545 /* This is the bucket page. */ 546 REC_FGET(mpf, ip, argp->pgno, &pagep, donext); 547 548 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 549 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 550 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 551 552 if (cmp_p == 0 && DB_REDO(op)) { 553 /* Need to redo update described. 
*/ 554 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 555 memcpy(pagep, argp->page.data, argp->page.size); 556 PGNO(pagep) = argp->pgno; 557 PREV_PGNO(pagep) = PGNO_INVALID; 558 LSN(pagep) = *lsnp; 559 } else if (cmp_n == 0 && DB_UNDO(op)) { 560 /* Need to undo update described. */ 561 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 562 P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID, 563 argp->next_pgno, 0, P_HASH); 564 LSN(pagep) = argp->pagelsn; 565 } 566 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 567 goto out; 568 pagep = NULL; 569 570donext: /* Now fix up the "next" page. */ 571 REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn); 572 573 /* For REDO just update the LSN. For UNDO copy page back. */ 574 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 575 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn); 576 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn); 577 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 578 if (cmp_p == 0 && DB_REDO(op)) { 579 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 580 LSN(pagep) = *lsnp; 581 } else if (cmp_n == 0 && DB_UNDO(op)) { 582 /* Need to undo update described. */ 583 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 584 memcpy(pagep, argp->page.data, argp->page.size); 585 } 586 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 587 goto out; 588 pagep = NULL; 589 590 /* Now fix up the next's next page. */ 591do_nn: if (argp->nnext_pgno == PGNO_INVALID) 592 goto done; 593 594 REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done); 595 596 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 597 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn); 598 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn); 599 CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); 600 601 if (cmp_p == 0 && DB_REDO(op)) { 602 /* Need to redo update described. 
*/ 603 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 604 PREV_PGNO(pagep) = argp->pgno; 605 LSN(pagep) = *lsnp; 606 } else if (cmp_n == 0 && DB_UNDO(op)) { 607 /* Need to undo update described. */ 608 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 609 PREV_PGNO(pagep) = argp->next_pgno; 610 LSN(pagep) = argp->nnextlsn; 611 } 612 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 613 goto out; 614 pagep = NULL; 615 616done: *lsnp = argp->prev_lsn; 617 ret = 0; 618 619out: if (pagep != NULL) 620 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 621 REC_CLOSE; 622} 623 624/* 625 * __ham_metagroup_recover -- 626 * Recovery function for metagroup. 627 * 628 * PUBLIC: int __ham_metagroup_recover 629 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 630 */ 631int 632__ham_metagroup_recover(env, dbtp, lsnp, op, info) 633 ENV *env; 634 DBT *dbtp; 635 DB_LSN *lsnp; 636 db_recops op; 637 void *info; 638{ 639 __ham_metagroup_args *argp; 640 DB_THREAD_INFO *ip; 641 HASH_CURSOR *hcp; 642 DB *file_dbp; 643 DBMETA *mmeta; 644 DBC *dbc; 645 DB_MPOOLFILE *mpf; 646 PAGE *pagep; 647 db_pgno_t pgno; 648 int cmp_n, cmp_p, did_alloc, groupgrow, ret; 649 650 ip = ((DB_TXNHEAD *)info)->thread_info; 651 mmeta = NULL; 652 did_alloc = 0; 653 REC_PRINT(__ham_metagroup_print); 654 REC_INTRO(__ham_metagroup_read, ip, 1); 655 656 /* 657 * This logs the virtual create of pages pgno to pgno + bucket. 658 * The log record contains: 659 * bucket: old maximum bucket 660 * pgno: page number of the new bucket. 661 * We round up on log calculations, so we can figure out if we are 662 * about to double the hash table if argp->bucket+1 is a power of 2. 663 * If it is, then we are allocating an entire doubling of pages, 664 * otherwise, we are simply allocated one new page. 
665 */ 666 groupgrow = 667 (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1; 668 pgno = argp->pgno; 669 if (argp->newalloc) 670 pgno += argp->bucket; 671 672 pagep = NULL; 673 ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep); 674 675 /* If we are undoing, then we don't want to create the page. */ 676 if (ret != 0 && DB_REDO(op)) 677 ret = __memp_fget(mpf, 678 &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep); 679 else if (ret == DB_PAGE_NOTFOUND) 680 goto do_meta; 681 if (ret != 0) { 682 if (ret != ENOSPC) 683 goto out; 684 pgno = 0; 685 goto do_meta; 686 } 687 688 /* 689 * When we get here then either we did not grow the file 690 * (groupgrow == 0) or we did grow the file and the allocation 691 * of those new pages succeeded. 692 */ 693 did_alloc = groupgrow; 694 695 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 696 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 697 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 698 699 if (cmp_p == 0 && DB_REDO(op)) { 700 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 701 pagep->lsn = *lsnp; 702 } else if (cmp_n == 0 && DB_UNDO(op)) { 703 /* If this record allocated the pages give them back. */ 704 if (argp->newalloc) { 705 if (pagep != NULL && (ret = __memp_fput(mpf, 706 ip, pagep, DB_PRIORITY_VERY_LOW)) != 0) 707 goto out; 708 pagep = NULL; 709 if ((ret = __memp_ftruncate(mpf, NULL, ip, 710 argp->pgno, 0)) != 0) 711 goto out; 712 } else { 713 /* 714 * Otherwise just roll the page back to its 715 * previous state. 716 */ 717 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 718 pagep->lsn = argp->pagelsn; 719 } 720 } 721 if (pagep != NULL && 722 (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 723 goto out; 724 725 /* 726 * If a earlier aborted allocation used one of our pages it may 727 * be in the wrong state, read all the pages in the group and init 728 * them to be empty. 
729 */ 730 if (DB_REDO(op) && argp->newalloc) { 731 for (pgno = argp->pgno; 732 pgno < argp->pgno + argp->bucket; pgno++) { 733 if ((ret = __memp_fget(mpf, 734 &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) 735 goto out; 736 737 if (IS_ZERO_LSN(LSN(pagep))) 738 P_INIT(pagep, file_dbp->pgsize, 739 PGNO_INVALID, PGNO_INVALID, PGNO_INVALID, 740 0, P_HASH); 741 if ((ret = 742 __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 743 goto out; 744 } 745 } 746 747do_meta: 748 /* Now we have to update the meta-data page. */ 749 hcp = (HASH_CURSOR *)dbc->internal; 750 if ((ret = __ham_get_meta(dbc)) != 0) 751 goto out; 752 cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn); 753 cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn); 754 CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn); 755 CHECK_ABORT(env, op, cmp_n, &hcp->hdr->dbmeta.lsn, lsnp); 756 if (cmp_p == 0 && DB_REDO(op)) { 757 /* Redo the actual updating of bucket counts. */ 758 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 759 ++hcp->hdr->max_bucket; 760 if (groupgrow) { 761 hcp->hdr->low_mask = hcp->hdr->high_mask; 762 hcp->hdr->high_mask = 763 (argp->bucket + 1) | hcp->hdr->low_mask; 764 } 765 hcp->hdr->dbmeta.lsn = *lsnp; 766 } else if (cmp_n == 0 && DB_UNDO(op)) { 767 /* Undo the actual updating of bucket counts. */ 768 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 769 hcp->hdr->max_bucket = argp->bucket; 770 if (groupgrow) { 771 hcp->hdr->high_mask = argp->bucket; 772 hcp->hdr->low_mask = hcp->hdr->high_mask >> 1; 773 } 774 hcp->hdr->dbmeta.lsn = argp->metalsn; 775 } 776 777 /* 778 * Now we need to fix up the spares array. Each entry in the 779 * spares array indicates the beginning page number for the 780 * indicated doubling. We need to fill this in whenever the 781 * spares array is invalid, if we never reclaim pages then 782 * we have to allocate the pages to the spares array in both 783 * the redo and undo cases. 
784 */ 785 if (did_alloc && !DB_UNDO(op) && 786 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) { 787 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 788 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] = 789 (argp->pgno - argp->bucket) - 1; 790 } 791 if (cmp_n == 0 && groupgrow && DB_UNDO(op)) { 792 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 793 hcp->hdr->spares[ 794 __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID; 795 } 796 797 /* 798 * Finally, we need to potentially fix up the last_pgno field 799 * in the master meta-data page (which may or may not be the 800 * same as the hash header page). 801 */ 802 if (argp->mmpgno != argp->mpgno) { 803 if ((ret = __memp_fget(mpf, 804 &argp->mmpgno, ip, NULL, DB_MPOOL_EDIT, &mmeta)) != 0) { 805 if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND) 806 ret = 0; 807 goto out; 808 } 809 cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn); 810 cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn); 811 if (cmp_p == 0 && DB_REDO(op)) { 812 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 813 mmeta->lsn = *lsnp; 814 } else if (cmp_n == 0 && DB_UNDO(op)) { 815 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 816 mmeta->lsn = argp->mmetalsn; 817 } 818 } else { 819 mmeta = (DBMETA *)hcp->hdr; 820 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 821 } 822 823 if (cmp_n == 0 && DB_UNDO(op)) 824 mmeta->last_pgno = argp->last_pgno; 825 else if (DB_REDO(op) && mmeta->last_pgno < pgno) 826 mmeta->last_pgno = pgno; 827 828 if (argp->mmpgno != argp->mpgno && 829 (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0) 830 goto out; 831 mmeta = NULL; 832 833done: *lsnp = argp->prev_lsn; 834 ret = 0; 835 836out: if (mmeta != NULL) 837 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 838 if (dbc != NULL) 839 (void)__ham_release_meta(dbc); 840 841 REC_CLOSE; 842} 843 844/* 845 * __ham_groupalloc_recover -- 846 * Recover the batch creation of a set of pages for a new database. 
847 * 848 * PUBLIC: int __ham_groupalloc_recover 849 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 850 */ 851int 852__ham_groupalloc_recover(env, dbtp, lsnp, op, info) 853 ENV *env; 854 DBT *dbtp; 855 DB_LSN *lsnp; 856 db_recops op; 857 void *info; 858{ 859 __ham_groupalloc_args *argp; 860 DB_THREAD_INFO *ip; 861 DBMETA *mmeta; 862 DB_MPOOLFILE *mpf; 863 DB *file_dbp; 864 DBC *dbc; 865 PAGE *pagep; 866 db_pgno_t pgno; 867 int cmp_n, cmp_p, ret; 868 869 ip = ((DB_TXNHEAD *)info)->thread_info; 870 mmeta = NULL; 871 REC_PRINT(__ham_groupalloc_print); 872 REC_INTRO(__ham_groupalloc_read, ip, 1); 873 874 pgno = PGNO_BASE_MD; 875 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) { 876 if (DB_REDO(op)) { 877 ret = __db_pgerr(file_dbp, pgno, ret); 878 goto out; 879 } else 880 goto done; 881 } 882 883 cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta)); 884 cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn); 885 CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn); 886 CHECK_ABORT(env, op, cmp_n, &LSN(mmeta), lsnp); 887 888 /* 889 * Basically, we used mpool to allocate a chunk of pages. 890 * We need to either add those to a free list (in the undo 891 * case) or initialize them (in the redo case). 892 * 893 * If we are redoing and this is a hash subdatabase, it's possible 894 * that the pages were never allocated, so we'd better check for 895 * that and handle it here. 896 */ 897 pgno = argp->start_pgno + argp->num - 1; 898 if (DB_REDO(op)) { 899 if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0) 900 goto out; 901 if (cmp_p == 0) { 902 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 903 LSN(mmeta) = *lsnp; 904 } 905 } else if (DB_UNDO(op)) { 906 /* 907 * Fetch the last page and determine if it is in 908 * the post allocation state. 
909 */ 910 pagep = NULL; 911 if ((ret = __memp_fget(mpf, &pgno, 912 ip, NULL, DB_MPOOL_EDIT, &pagep)) == 0) { 913 if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) { 914 if ((ret = __memp_fput(mpf, ip, 915 pagep, DB_PRIORITY_VERY_LOW)) != 0) 916 goto out; 917 pagep = NULL; 918 } 919 } else if (ret != DB_PAGE_NOTFOUND) 920 goto out; 921 /* 922 * If the last page was allocated then truncate back 923 * to the first page. 924 */ 925 if (pagep != NULL) { 926 if ((ret = __memp_fput(mpf, ip, 927 pagep, DB_PRIORITY_VERY_LOW)) != 0) 928 goto out; 929 if ((ret = __memp_ftruncate(mpf, NULL, 930 ip, argp->start_pgno, 0)) != 0) 931 goto out; 932 } 933 934 /* 935 * If we are rolling back the metapage, then make 936 * sure it reflects the the correct last_pgno. 937 */ 938 if (cmp_n == 0) { 939 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 940 mmeta->last_pgno = argp->last_pgno; 941 } 942 pgno = 0; 943 if (cmp_n == 0) { 944 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 945 LSN(mmeta) = argp->meta_lsn; 946 } 947 } 948 949 /* 950 * Set the last page number to the current value. 951 */ 952 if (pgno > mmeta->last_pgno) { 953 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 954 mmeta->last_pgno = pgno; 955 } 956 957done: if (ret == 0) 958 *lsnp = argp->prev_lsn; 959 ret = 0; 960 961out: if (mmeta != NULL) 962 (void)__memp_fput(mpf, ip, mmeta, file_dbp->priority); 963 964 REC_CLOSE; 965} 966 967/* 968 * __ham_alloc_pages -- 969 * 970 * Called during redo of a file create. We create new pages in the file 971 * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a 972 * __crdel_metasub message. If we manage to crash without the newly written 973 * pages getting to disk (I'm not sure this can happen anywhere except our 974 * test suite?!), then we need to go through a recreate the final pages. 975 * Hash normally has holes in its files and handles them appropriately. 
976 */ 977static int 978__ham_alloc_pages(dbc, argp, lsnp) 979 DBC *dbc; 980 __ham_groupalloc_args *argp; 981 DB_LSN *lsnp; 982{ 983 DB *file_dbp; 984 DB_MPOOLFILE *mpf; 985 DB_THREAD_INFO *ip; 986 PAGE *pagep; 987 db_pgno_t pgno; 988 int ret; 989 990 file_dbp = dbc->dbp; 991 mpf = file_dbp->mpf; 992 ip = dbc->thread_info; 993 994 /* Read the last page of the allocation. */ 995 pgno = argp->start_pgno + argp->num - 1; 996 997 /* If the page exists, and it has been initialized, then we're done. */ 998 if ((ret = 999 __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) { 1000 if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn)) 1001 goto reinit_page; 1002 return (__memp_fput(mpf, ip, pagep, dbc->priority)); 1003 } 1004 1005 /* Had to create the page. */ 1006 if ((ret = __memp_fget(mpf, &pgno, 1007 ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) 1008 return (__db_pgerr(dbc->dbp, pgno, ret)); 1009 1010reinit_page: 1011 /* Initialize the newly allocated page. */ 1012 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 1013 P_INIT(pagep, dbc->dbp->pgsize, 1014 pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 1015 pagep->lsn = *lsnp; 1016 1017out: return (__memp_fput(mpf, ip, pagep, dbc->priority)); 1018} 1019 1020/* 1021 * __ham_curadj_recover -- 1022 * Undo cursor adjustments if a subtransaction fails. 
1023 * 1024 * PUBLIC: int __ham_curadj_recover 1025 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1026 */ 1027int 1028__ham_curadj_recover(env, dbtp, lsnp, op, info) 1029 ENV *env; 1030 DBT *dbtp; 1031 DB_LSN *lsnp; 1032 db_recops op; 1033 void *info; 1034{ 1035 __ham_curadj_args *argp; 1036 db_ham_curadj mode, hamc_mode; 1037 DB_THREAD_INFO *ip; 1038 DB_MPOOLFILE *mpf; 1039 DB *file_dbp; 1040 DBC *dbc; 1041 HASH_CURSOR *hcp; 1042 int ret; 1043 1044 ip = ((DB_TXNHEAD *)info)->thread_info; 1045 REC_PRINT(__ham_curadj_print); 1046 REC_INTRO(__ham_curadj_read, ip, 1); 1047 1048 if (op != DB_TXN_ABORT) 1049 goto done; 1050 1051 mode = (db_ham_curadj)argp->add; 1052 1053 /* 1054 * Reverse the logged operation, so that the consequences are reversed 1055 * by the __hamc_update code. 1056 */ 1057 switch (mode) { 1058 case DB_HAM_CURADJ_DEL: 1059 hamc_mode = DB_HAM_CURADJ_ADD; 1060 break; 1061 case DB_HAM_CURADJ_ADD: 1062 hamc_mode = DB_HAM_CURADJ_DEL; 1063 break; 1064 case DB_HAM_CURADJ_ADDMOD: 1065 hamc_mode = DB_HAM_CURADJ_DELMOD; 1066 break; 1067 case DB_HAM_CURADJ_DELMOD: 1068 hamc_mode = DB_HAM_CURADJ_ADDMOD; 1069 break; 1070 default: 1071 __db_errx(env, 1072 "Invalid flag in __ham_curadj_recover"); 1073 ret = EINVAL; 1074 goto out; 1075 } 1076 1077 /* 1078 * Undo the adjustment by reinitializing the the cursor to look like 1079 * the one that was used to do the adjustment, then we invert the 1080 * add so that undo the adjustment. 1081 */ 1082 hcp = (HASH_CURSOR *)dbc->internal; 1083 hcp->pgno = argp->pgno; 1084 hcp->indx = argp->indx; 1085 hcp->dup_off = argp->dup_off; 1086 hcp->order = argp->order; 1087 if (mode == DB_HAM_CURADJ_DEL) 1088 F_SET(hcp, H_DELETED); 1089 (void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup); 1090 1091done: *lsnp = argp->prev_lsn; 1092out: REC_CLOSE; 1093} 1094 1095/* 1096 * __ham_chgpg_recover -- 1097 * Undo cursor adjustments if a subtransaction fails. 
1098 * 1099 * PUBLIC: int __ham_chgpg_recover 1100 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1101 */ 1102int 1103__ham_chgpg_recover(env, dbtp, lsnp, op, info) 1104 ENV *env; 1105 DBT *dbtp; 1106 DB_LSN *lsnp; 1107 db_recops op; 1108 void *info; 1109{ 1110 __ham_chgpg_args *argp; 1111 DB_THREAD_INFO *ip; 1112 BTREE_CURSOR *opdcp; 1113 DB_MPOOLFILE *mpf; 1114 DB *file_dbp, *ldbp; 1115 DBC *dbc; 1116 DBC *cp; 1117 HASH_CURSOR *lcp; 1118 u_int32_t order, indx; 1119 int ret; 1120 1121 ip = ((DB_TXNHEAD *)info)->thread_info; 1122 REC_PRINT(__ham_chgpg_print); 1123 REC_INTRO(__ham_chgpg_read, ip, 0); 1124 1125 if (op != DB_TXN_ABORT) 1126 goto done; 1127 1128 /* Overloaded fields for DB_HAM_DEL*PG */ 1129 indx = argp->old_indx; 1130 order = argp->new_indx; 1131 1132 MUTEX_LOCK(env, env->mtx_dblist); 1133 FIND_FIRST_DB_MATCH(env, file_dbp, ldbp); 1134 for (; 1135 ldbp != NULL && ldbp->adj_fileid == file_dbp->adj_fileid; 1136 ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { 1137 MUTEX_LOCK(env, file_dbp->mutex); 1138 TAILQ_FOREACH(cp, &ldbp->active_queue, links) { 1139 lcp = (HASH_CURSOR *)cp->internal; 1140 1141 switch (argp->mode) { 1142 case DB_HAM_DELFIRSTPG: 1143 if (lcp->pgno != argp->new_pgno || 1144 MVCC_SKIP_CURADJ(cp, lcp->pgno)) 1145 break; 1146 if (lcp->indx != indx || 1147 !F_ISSET(lcp, H_DELETED) || 1148 lcp->order >= order) { 1149 lcp->pgno = argp->old_pgno; 1150 if (lcp->indx == indx) 1151 lcp->order -= order; 1152 } 1153 break; 1154 case DB_HAM_DELMIDPG: 1155 case DB_HAM_DELLASTPG: 1156 if (lcp->pgno == argp->new_pgno && 1157 lcp->indx == indx && 1158 F_ISSET(lcp, H_DELETED) && 1159 lcp->order >= order && 1160 !MVCC_SKIP_CURADJ(cp, lcp->pgno)) { 1161 lcp->pgno = argp->old_pgno; 1162 lcp->order -= order; 1163 lcp->indx = 0; 1164 } 1165 break; 1166 case DB_HAM_CHGPG: 1167 /* 1168 * If we're doing a CHGPG, we're undoing 1169 * the move of a non-deleted item to a 1170 * new page. 
Any cursors with the deleted 1171 * flag set do not belong to this item; 1172 * don't touch them. 1173 */ 1174 if (F_ISSET(lcp, H_DELETED)) 1175 break; 1176 /* FALLTHROUGH */ 1177 case DB_HAM_SPLIT: 1178 if (lcp->pgno == argp->new_pgno && 1179 lcp->indx == argp->new_indx && 1180 !MVCC_SKIP_CURADJ(cp, lcp->pgno)) { 1181 lcp->indx = argp->old_indx; 1182 lcp->pgno = argp->old_pgno; 1183 } 1184 break; 1185 case DB_HAM_DUP: 1186 if (lcp->opd == NULL) 1187 break; 1188 opdcp = (BTREE_CURSOR *)lcp->opd->internal; 1189 if (opdcp->pgno != argp->new_pgno || 1190 opdcp->indx != argp->new_indx || 1191 MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno)) 1192 break; 1193 1194 if (F_ISSET(opdcp, C_DELETED)) 1195 F_SET(lcp, H_DELETED); 1196 /* 1197 * We can't close a cursor while we have the 1198 * dbp mutex locked, since c_close reacquires 1199 * it. It should be safe to drop the mutex 1200 * here, though, since newly opened cursors 1201 * are put only at the end of the tailq and 1202 * the cursor we're adjusting can't be closed 1203 * under us. 1204 */ 1205 MUTEX_UNLOCK(env, file_dbp->mutex); 1206 if ((ret = __dbc_close(lcp->opd)) != 0) 1207 goto out; 1208 MUTEX_LOCK(env, file_dbp->mutex); 1209 lcp->opd = NULL; 1210 break; 1211 } 1212 } 1213 MUTEX_UNLOCK(env, file_dbp->mutex); 1214 } 1215 MUTEX_UNLOCK(env, env->mtx_dblist); 1216 1217done: *lsnp = argp->prev_lsn; 1218out: REC_CLOSE; 1219} 1220 1221/* 1222 * __ham_metagroup_recover -- 1223 * Recovery function for metagroup. 
1224 * 1225 * PUBLIC: int __ham_metagroup_42_recover 1226 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1227 */ 1228int 1229__ham_metagroup_42_recover(env, dbtp, lsnp, op, info) 1230 ENV *env; 1231 DBT *dbtp; 1232 DB_LSN *lsnp; 1233 db_recops op; 1234 void *info; 1235{ 1236 __ham_metagroup_42_args *argp; 1237 DB_THREAD_INFO *ip; 1238 HASH_CURSOR *hcp; 1239 DB *file_dbp; 1240 DBMETA *mmeta; 1241 DBC *dbc; 1242 DB_MPOOLFILE *mpf; 1243 PAGE *pagep; 1244 db_pgno_t pgno; 1245 u_int32_t flags; 1246 int cmp_n, cmp_p, did_alloc, groupgrow, ret; 1247 1248 ip = ((DB_TXNHEAD *)info)->thread_info; 1249 mmeta = NULL; 1250 did_alloc = 0; 1251 REC_PRINT(__ham_metagroup_42_print); 1252 REC_INTRO(__ham_metagroup_42_read, ip, 1); 1253 1254 /* 1255 * This logs the virtual create of pages pgno to pgno + bucket 1256 * If HAVE_FTRUNCATE is not supported the mpool page-allocation is not 1257 * transaction protected, we can never undo it. Even in an abort, 1258 * we have to allocate these pages to the hash table if they 1259 * were actually created. In particular, during disaster 1260 * recovery the metapage may be before this point if we 1261 * are rolling backward. If the file has not been extended 1262 * then the metapage could not have been updated. 1263 * The log record contains: 1264 * bucket: old maximum bucket 1265 * pgno: page number of the new bucket. 1266 * We round up on log calculations, so we can figure out if we are 1267 * about to double the hash table if argp->bucket+1 is a power of 2. 1268 * If it is, then we are allocating an entire doubling of pages, 1269 * otherwise, we are simply allocated one new page. 
1270 */ 1271 groupgrow = 1272 (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1; 1273 pgno = argp->pgno; 1274 if (argp->newalloc) 1275 pgno += argp->bucket; 1276 1277 flags = 0; 1278 pagep = NULL; 1279 LF_SET(DB_MPOOL_CREATE); 1280 ret = __memp_fget(mpf, &pgno, ip, NULL, flags, &pagep); 1281 1282 if (ret != 0) { 1283 if (ret != ENOSPC) 1284 goto out; 1285 pgno = 0; 1286 goto do_meta; 1287 } 1288 1289 /* 1290 * When we get here then either we did not grow the file 1291 * (groupgrow == 0) or we did grow the file and the allocation 1292 * of those new pages succeeded. 1293 */ 1294 did_alloc = groupgrow; 1295 1296 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 1297 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 1298 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 1299 1300 if (cmp_p == 0 && DB_REDO(op)) { 1301 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 1302 pagep->lsn = *lsnp; 1303 } else if (cmp_n == 0 && DB_UNDO(op)) { 1304 /* 1305 * Otherwise just roll the page back to its 1306 * previous state. 1307 */ 1308 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 1309 pagep->lsn = argp->pagelsn; 1310 } 1311 if (pagep != NULL && 1312 (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 1313 goto out; 1314 1315do_meta: 1316 /* Now we have to update the meta-data page. */ 1317 hcp = (HASH_CURSOR *)dbc->internal; 1318 if ((ret = __ham_get_meta(dbc)) != 0) 1319 goto out; 1320 cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn); 1321 cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn); 1322 CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn); 1323 if (cmp_p == 0 && DB_REDO(op)) { 1324 /* Redo the actual updating of bucket counts. 
*/ 1325 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1326 ++hcp->hdr->max_bucket; 1327 if (groupgrow) { 1328 hcp->hdr->low_mask = hcp->hdr->high_mask; 1329 hcp->hdr->high_mask = 1330 (argp->bucket + 1) | hcp->hdr->low_mask; 1331 } 1332 hcp->hdr->dbmeta.lsn = *lsnp; 1333 } else if (cmp_n == 0 && DB_UNDO(op)) { 1334 /* Undo the actual updating of bucket counts. */ 1335 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1336 hcp->hdr->max_bucket = argp->bucket; 1337 if (groupgrow) { 1338 hcp->hdr->high_mask = argp->bucket; 1339 hcp->hdr->low_mask = hcp->hdr->high_mask >> 1; 1340 } 1341 hcp->hdr->dbmeta.lsn = argp->metalsn; 1342 } 1343 1344 /* 1345 * Now we need to fix up the spares array. Each entry in the 1346 * spares array indicates the beginning page number for the 1347 * indicated doubling. We need to fill this in whenever the 1348 * spares array is invalid, if we never reclaim pages then 1349 * we have to allocate the pages to the spares array in both 1350 * the redo and undo cases. 1351 */ 1352 if (did_alloc && 1353 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) { 1354 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1355 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] = 1356 (argp->pgno - argp->bucket) - 1; 1357 } 1358 1359 /* 1360 * Finally, we need to potentially fix up the last_pgno field 1361 * in the master meta-data page (which may or may not be the 1362 * same as the hash header page). 
1363 */ 1364 if (argp->mmpgno != argp->mpgno) { 1365 if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL, 1366 DB_MPOOL_EDIT, &mmeta)) != 0) { 1367 if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND) 1368 ret = 0; 1369 goto out; 1370 } 1371 cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn); 1372 cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn); 1373 if (cmp_p == 0 && DB_REDO(op)) { 1374 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1375 mmeta->lsn = *lsnp; 1376 } else if (cmp_n == 0 && DB_UNDO(op)) { 1377 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1378 mmeta->lsn = argp->mmetalsn; 1379 } 1380 } else { 1381 mmeta = (DBMETA *)hcp->hdr; 1382 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1383 } 1384 1385 if (mmeta->last_pgno < pgno) 1386 mmeta->last_pgno = pgno; 1387 1388 if (argp->mmpgno != argp->mpgno && 1389 (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0) 1390 goto out; 1391 mmeta = NULL; 1392 1393done: *lsnp = argp->prev_lsn; 1394 ret = 0; 1395 1396out: if (mmeta != NULL) 1397 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 1398 if (dbc != NULL) 1399 (void)__ham_release_meta(dbc); 1400 1401 REC_CLOSE; 1402} 1403 1404/* 1405 * __ham_groupalloc_42_recover -- 1406 * Recover the batch creation of a set of pages for a new database. 
1407 * 1408 * PUBLIC: int __ham_groupalloc_42_recover 1409 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1410 */ 1411int 1412__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info) 1413 ENV *env; 1414 DBT *dbtp; 1415 DB_LSN *lsnp; 1416 db_recops op; 1417 void *info; 1418{ 1419 __ham_groupalloc_42_args *argp; 1420 DB_THREAD_INFO *ip; 1421 DBMETA *mmeta; 1422 DB_MPOOLFILE *mpf; 1423 DB *file_dbp; 1424 DBC *dbc; 1425 db_pgno_t pgno; 1426 int cmp_p, ret; 1427 1428 ip = ((DB_TXNHEAD *)info)->thread_info; 1429 mmeta = NULL; 1430 REC_PRINT(__ham_groupalloc_42_print); 1431 REC_INTRO(__ham_groupalloc_42_read, ip, 1); 1432 1433 pgno = PGNO_BASE_MD; 1434 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) { 1435 if (DB_REDO(op)) { 1436 ret = __db_pgerr(file_dbp, pgno, ret); 1437 goto out; 1438 } else 1439 goto done; 1440 } 1441 1442 cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn); 1443 CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn); 1444 1445 /* 1446 * Basically, we used mpool to allocate a chunk of pages. 1447 * We need to either add those to a free list (in the undo 1448 * case) or initialize them (in the redo case). 1449 * 1450 * If we are redoing and this is a hash subdatabase, it's possible 1451 * that the pages were never allocated, so we'd better check for 1452 * that and handle it here. 1453 */ 1454 pgno = argp->start_pgno + argp->num - 1; 1455 if (DB_REDO(op)) { 1456 if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0) 1457 goto out; 1458 if (cmp_p == 0) { 1459 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1460 LSN(mmeta) = *lsnp; 1461 } 1462 } else if (DB_UNDO(op)) { 1463 /* 1464 * We cannot roll back 4.2 style allocations. 1465 */ 1466 __db_errx(env, 1467"Cannot replicate prepared transactions from master running release 4.2."); 1468 ret = __env_panic(env, EINVAL); 1469 goto out; 1470 } 1471 1472 /* 1473 * In both REDO and UNDO, we have grown the file and need to make 1474 * sure that last_pgno is correct. 
If we HAVE_FTRUNCATE pgno 1475 * will only be valid on REDO. 1476 */ 1477 if (pgno > mmeta->last_pgno) { 1478 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1479 mmeta->last_pgno = pgno; 1480 } 1481 1482done: if (ret == 0) 1483 *lsnp = argp->prev_lsn; 1484 ret = 0; 1485 1486out: if (mmeta != NULL) 1487 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 1488 1489 REC_CLOSE; 1490} 1491 1492/* 1493 * __ham_alloc_pages_42 -- 1494 * 1495 * Called during redo of a file create. We create new pages in the file 1496 * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a 1497 * __crdel_metasub message. If we manage to crash without the newly written 1498 * pages getting to disk (I'm not sure this can happen anywhere except our 1499 * test suite?!), then we need to go through a recreate the final pages. 1500 * Hash normally has holes in its files and handles them appropriately. 1501 */ 1502static int 1503__ham_alloc_pages_42(dbc, argp, lsnp) 1504 DBC *dbc; 1505 __ham_groupalloc_42_args *argp; 1506 DB_LSN *lsnp; 1507{ 1508 DB_MPOOLFILE *mpf; 1509 DB_THREAD_INFO *ip; 1510 PAGE *pagep; 1511 db_pgno_t pgno; 1512 int ret; 1513 1514 mpf = dbc->dbp->mpf; 1515 ip = dbc->thread_info; 1516 1517 /* Read the last page of the allocation. */ 1518 pgno = argp->start_pgno + argp->num - 1; 1519 1520 /* If the page exists, and it has been initialized, then we're done. */ 1521 if ((ret = __memp_fget(mpf, 1522 &pgno, ip, NULL, 0, &pagep)) == 0) { 1523 if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn)) 1524 goto reinit_page; 1525 if ((ret = __memp_fput(mpf, 1526 ip, pagep, dbc->priority)) != 0) 1527 return (ret); 1528 return (0); 1529 } 1530 1531 /* Had to create the page. */ 1532 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 1533 DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0) 1534 return (__db_pgerr(dbc->dbp, pgno, ret)); 1535 1536reinit_page: 1537 /* Initialize the newly allocated page. 
*/ 1538 P_INIT(pagep, 1539 dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 1540 pagep->lsn = *lsnp; 1541 1542 if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 1543 return (ret); 1544 1545 return (0); 1546} 1547