1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1995, 1996 8 * Margo Seltzer. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1995, 1996 12 * The President and Fellows of Harvard University. All rights reserved. 13 * 14 * This code is derived from software contributed to Berkeley by 15 * Margo Seltzer. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 40 * 41 * $Id: hash_rec.c,v 12.44 2008/02/18 04:46:43 mjc Exp $ 42 */ 43 44#include "db_config.h" 45 46#include "db_int.h" 47#include "dbinc/db_page.h" 48#include "dbinc/btree.h" 49#include "dbinc/hash.h" 50#include "dbinc/log.h" 51#include "dbinc/mp.h" 52 53static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *)); 54static int __ham_alloc_pages_42 55 __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *)); 56 57/* 58 * __ham_insdel_recover -- 59 * 60 * PUBLIC: int __ham_insdel_recover 61 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 62 */ 63int 64__ham_insdel_recover(env, dbtp, lsnp, op, info) 65 ENV *env; 66 DBT *dbtp; 67 DB_LSN *lsnp; 68 db_recops op; 69 void *info; 70{ 71 __ham_insdel_args *argp; 72 DB_THREAD_INFO *ip; 73 DB *file_dbp; 74 DBC *dbc; 75 DB_MPOOLFILE *mpf; 76 PAGE *pagep; 77 db_indx_t dindx; 78 u_int32_t opcode; 79 int cmp_n, cmp_p, dtype, ktype, ret; 80 81 ip = ((DB_TXNHEAD *)info)->thread_info; 82 pagep = NULL; 83 REC_PRINT(__ham_insdel_print); 84 REC_INTRO(__ham_insdel_read, ip, 1); 85 86 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 87 0, &pagep)) != 0) { 88 if (DB_UNDO(op)) { 89 if (ret == DB_PAGE_NOTFOUND) 90 goto done; 91 else { 92 ret = __db_pgerr(file_dbp, argp->pgno, ret); 93 goto out; 94 } 95 } 96 /* If the page is not here then it was later truncated. */ 97 if (!IS_ZERO_LSN(argp->pagelsn)) 98 goto done; 99 /* 100 * This page was created by a group allocation and 101 * the file may not have been extend yet. 102 * Create the page if necessary. 103 */ 104 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 105 DB_MPOOL_CREATE, &pagep)) != 0) { 106 ret = __db_pgerr(file_dbp, argp->pgno, ret); 107 goto out; 108 } 109 } 110 111 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 112 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 113 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 114 115 /* 116 * Two possible things going on: 117 * redo a delete/undo a put: delete the item from the page. 118 * redo a put/undo a delete: add the item to the page. 119 * If we are undoing a delete, then the information logged is the 120 * entire entry off the page, not just the data of a dbt. In 121 * this case, we want to copy it back onto the page verbatim. 122 * We do this by calling __insertpair with the type H_OFFPAGE instead 123 * of H_KEYDATA. 124 */ 125 opcode = OPCODE_OF(argp->opcode); 126 if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) || 127 (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) { 128 /* 129 * Need to redo a PUT or undo a delete. 130 */ 131 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 132 ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ? 133 H_OFFPAGE : H_KEYDATA; 134 if (PAIR_ISDATADUP(argp->opcode)) 135 dtype = H_DUPLICATE; 136 else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode)) 137 dtype = H_OFFPAGE; 138 else 139 dtype = H_KEYDATA; 140 dindx = (db_indx_t)argp->ndx; 141 if ((ret = __ham_insertpair(dbc, pagep, &dindx, 142 &argp->key, &argp->data, ktype, dtype)) != 0) 143 goto out; 144 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 145 } else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) || 146 (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) { 147 /* Need to undo a put or redo a delete. */ 148 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 149 __ham_dpair(file_dbp, pagep, argp->ndx); 150 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 151 } 152 153 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 154 goto out; 155 pagep = NULL; 156 157 /* Return the previous LSN. */ 158done: *lsnp = argp->prev_lsn; 159 ret = 0; 160 161out: if (pagep != NULL) 162 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 163 REC_CLOSE; 164} 165 166/* 167 * __ham_newpage_recover -- 168 * This log message is used when we add/remove overflow pages. This 169 * message takes care of the pointer chains, not the data on the pages. 170 * 171 * PUBLIC: int __ham_newpage_recover 172 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 173 */ 174int 175__ham_newpage_recover(env, dbtp, lsnp, op, info) 176 ENV *env; 177 DBT *dbtp; 178 DB_LSN *lsnp; 179 db_recops op; 180 void *info; 181{ 182 __ham_newpage_args *argp; 183 DB_THREAD_INFO *ip; 184 DB *file_dbp; 185 DBC *dbc; 186 DB_MPOOLFILE *mpf; 187 PAGE *pagep; 188 int change, cmp_n, cmp_p, ret; 189 190 ip = ((DB_TXNHEAD *)info)->thread_info; 191 pagep = NULL; 192 REC_PRINT(__ham_newpage_print); 193 REC_INTRO(__ham_newpage_read, ip, 0); 194 195 REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage); 196 change = 0; 197 198 /* 199 * There are potentially three pages we need to check: the one 200 * that we created/deleted, the one before it and the one after 201 * it. 202 */ 203 204 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 205 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 206 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 207 208 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 209 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 210 /* Redo a create new page or undo a delete new page. */ 211 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 212 P_INIT(pagep, file_dbp->pgsize, argp->new_pgno, 213 argp->prev_pgno, argp->next_pgno, 0, P_HASH); 214 change = 1; 215 } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) || 216 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 217 /* 218 * Redo a delete or undo a create new page. All we 219 * really need to do is change the LSN. 220 */ 221 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 222 change = 1; 223 } 224 225 if (change) 226 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; 227 228 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 229 goto out; 230 pagep = NULL; 231 232 /* Now do the prev page. */ 233ppage: if (argp->prev_pgno != PGNO_INVALID) { 234 REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage); 235 236 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 237 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn); 238 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn); 239 change = 0; 240 241 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 242 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 243 /* Redo a create new page or undo a delete new page. */ 244 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 245 pagep->next_pgno = argp->new_pgno; 246 change = 1; 247 } else if ((cmp_p == 0 && 248 DB_REDO(op) && argp->opcode == DELOVFL) || 249 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 250 /* Redo a delete or undo a create new page. */ 251 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 252 pagep->next_pgno = argp->next_pgno; 253 change = 1; 254 } 255 256 if (change) 257 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn; 258 259 if ((ret = __memp_fput(mpf, 260 ip, pagep, file_dbp->priority)) != 0) 261 goto out; 262 pagep = NULL; 263 } 264 265 /* Now time to do the next page */ 266npage: if (argp->next_pgno != PGNO_INVALID) { 267 REC_FGET(mpf, ip, argp->next_pgno, &pagep, done); 268 269 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 270 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn); 271 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn); 272 change = 0; 273 274 if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || 275 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { 276 /* Redo a create new page or undo a delete new page. */ 277 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 278 pagep->prev_pgno = argp->new_pgno; 279 change = 1; 280 } else if ((cmp_p == 0 && 281 DB_REDO(op) && argp->opcode == DELOVFL) || 282 (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { 283 /* Redo a delete or undo a create new page. */ 284 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 285 pagep->prev_pgno = argp->prev_pgno; 286 change = 1; 287 } 288 289 if (change) 290 LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn; 291 292 if ((ret = __memp_fput(mpf, 293 ip, pagep, file_dbp->priority)) != 0) 294 goto out; 295 pagep = NULL; 296 } 297done: *lsnp = argp->prev_lsn; 298 ret = 0; 299 300out: if (pagep != NULL) 301 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 302 REC_CLOSE; 303} 304 305/* 306 * __ham_replace_recover -- 307 * This log message refers to partial puts that are local to a single 308 * page. You can think of them as special cases of the more general 309 * insdel log message. 310 * 311 * PUBLIC: int __ham_replace_recover 312 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 313 */ 314int 315__ham_replace_recover(env, dbtp, lsnp, op, info) 316 ENV *env; 317 DBT *dbtp; 318 DB_LSN *lsnp; 319 db_recops op; 320 void *info; 321{ 322 __ham_replace_args *argp; 323 DB_THREAD_INFO *ip; 324 DB *file_dbp; 325 DBC *dbc; 326 DB_MPOOLFILE *mpf; 327 DBT dbt; 328 PAGE *pagep; 329 u_int32_t change; 330 int cmp_n, cmp_p, is_plus, modified, ret; 331 u_int8_t *hk; 332 333 ip = ((DB_TXNHEAD *)info)->thread_info; 334 pagep = NULL; 335 REC_PRINT(__ham_replace_print); 336 REC_INTRO(__ham_replace_read, ip, 0); 337 338 REC_FGET(mpf, ip, argp->pgno, &pagep, done); 339 340 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 341 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 342 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 343 344 memset(&dbt, 0, sizeof(dbt)); 345 modified = 0; 346 347 /* 348 * Before we know the direction of the transformation we will 349 * determine the size differential; then once we know if we are 350 * redoing or undoing, we'll adjust the sign (is_plus) appropriately. 351 */ 352 if (argp->newitem.size > argp->olditem.size) { 353 change = argp->newitem.size - argp->olditem.size; 354 is_plus = 1; 355 } else { 356 change = argp->olditem.size - argp->newitem.size; 357 is_plus = 0; 358 } 359 if (cmp_p == 0 && DB_REDO(op)) { 360 /* Reapply the change as specified. */ 361 dbt.data = argp->newitem.data; 362 dbt.size = argp->newitem.size; 363 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 364 LSN(pagep) = *lsnp; 365 /* 366 * The is_plus flag is set properly to reflect 367 * newitem.size - olditem.size. 368 */ 369 modified = 1; 370 } else if (cmp_n == 0 && DB_UNDO(op)) { 371 /* Undo the already applied change. */ 372 dbt.data = argp->olditem.data; 373 dbt.size = argp->olditem.size; 374 /* 375 * Invert is_plus to reflect sign of 376 * olditem.size - newitem.size. 377 */ 378 is_plus = !is_plus; 379 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 380 LSN(pagep) = argp->pagelsn; 381 modified = 1; 382 } 383 384 if (modified) { 385 __ham_onpage_replace(file_dbp, pagep, 386 argp->ndx, argp->off, change, is_plus, &dbt); 387 if (argp->makedup) { 388 hk = P_ENTRY(file_dbp, pagep, argp->ndx); 389 if (DB_REDO(op)) 390 HPAGE_PTYPE(hk) = H_DUPLICATE; 391 else 392 HPAGE_PTYPE(hk) = H_KEYDATA; 393 } 394 } 395 396 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 397 goto out; 398 pagep = NULL; 399 400done: *lsnp = argp->prev_lsn; 401 ret = 0; 402 403out: if (pagep != NULL) 404 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 405 REC_CLOSE; 406} 407 408/* 409 * __ham_splitdata_recover -- 410 * 411 * PUBLIC: int __ham_splitdata_recover 412 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 413 */ 414int 415__ham_splitdata_recover(env, dbtp, lsnp, op, info) 416 ENV *env; 417 DBT *dbtp; 418 DB_LSN *lsnp; 419 db_recops op; 420 void *info; 421{ 422 __ham_splitdata_args *argp; 423 DB_THREAD_INFO *ip; 424 DB *file_dbp; 425 DBC *dbc; 426 DB_MPOOLFILE *mpf; 427 PAGE *pagep; 428 int cmp_n, cmp_p, ret; 429 430 ip = ((DB_TXNHEAD *)info)->thread_info; 431 pagep = NULL; 432 REC_PRINT(__ham_splitdata_print); 433 REC_INTRO(__ham_splitdata_read, ip, 1); 434 435 if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { 436 if (DB_UNDO(op)) { 437 if (ret == DB_PAGE_NOTFOUND) 438 goto done; 439 else { 440 ret = __db_pgerr(file_dbp, argp->pgno, ret); 441 goto out; 442 } 443 } 444 /* If the page is not here then it was later truncated. */ 445 if (!IS_ZERO_LSN(argp->pagelsn)) 446 goto done; 447 /* 448 * This page was created by a group allocation and 449 * the file may not have been extend yet. 450 * Create the page if necessary. 451 */ 452 if ((ret = __memp_fget(mpf, &argp->pgno, 453 ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) { 454 ret = __db_pgerr(file_dbp, argp->pgno, ret); 455 goto out; 456 } 457 } 458 459 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 460 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 461 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 462 463 /* 464 * There are three types of log messages here. Two are related 465 * to an actual page split operation, one for the old page 466 * and one for the new pages created. The original image in the 467 * SPLITOLD record is used for undo. The image in the SPLITNEW 468 * is used for redo. We should never have a case where there is 469 * a redo operation and the SPLITOLD record is on disk, but not 470 * the SPLITNEW record. Therefore, we only have work to do when 471 * redo NEW messages and undo OLD messages, but we have to update 472 * LSNs in both cases. 473 * 474 * The third message is generated when a page is sorted (SORTPAGE). In 475 * an undo the original image in the SORTPAGE is used. In a redo we 476 * recreate the sort operation by calling __ham_sort_page. 477 */ 478 if (cmp_p == 0 && DB_REDO(op)) { 479 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 480 if (argp->opcode == SPLITNEW) 481 /* Need to redo the split described. */ 482 memcpy(pagep, argp->pageimage.data, 483 argp->pageimage.size); 484 else if (argp->opcode == SORTPAGE) { 485 if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0) 486 goto out; 487 } 488 LSN(pagep) = *lsnp; 489 } else if (cmp_n == 0 && DB_UNDO(op)) { 490 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 491 if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) { 492 /* Put back the old image. */ 493 memcpy(pagep, argp->pageimage.data, 494 argp->pageimage.size); 495 } else 496 P_INIT(pagep, file_dbp->pgsize, argp->pgno, 497 PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 498 LSN(pagep) = argp->pagelsn; 499 } 500 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 501 goto out; 502 pagep = NULL; 503 504done: *lsnp = argp->prev_lsn; 505 ret = 0; 506 507out: if (pagep != NULL) 508 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 509 REC_CLOSE; 510} 511 512/* 513 * __ham_copypage_recover -- 514 * Recovery function for copypage. 515 * 516 * PUBLIC: int __ham_copypage_recover 517 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 518 */ 519int 520__ham_copypage_recover(env, dbtp, lsnp, op, info) 521 ENV *env; 522 DBT *dbtp; 523 DB_LSN *lsnp; 524 db_recops op; 525 void *info; 526{ 527 __ham_copypage_args *argp; 528 DB_THREAD_INFO *ip; 529 DB *file_dbp; 530 DBC *dbc; 531 DB_MPOOLFILE *mpf; 532 PAGE *pagep; 533 int cmp_n, cmp_p, ret; 534 535 ip = ((DB_TXNHEAD *)info)->thread_info; 536 pagep = NULL; 537 REC_PRINT(__ham_copypage_print); 538 REC_INTRO(__ham_copypage_read, ip, 0); 539 540 /* This is the bucket page. */ 541 REC_FGET(mpf, ip, argp->pgno, &pagep, donext); 542 543 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 544 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 545 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 546 547 if (cmp_p == 0 && DB_REDO(op)) { 548 /* Need to redo update described. */ 549 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 550 memcpy(pagep, argp->page.data, argp->page.size); 551 PGNO(pagep) = argp->pgno; 552 PREV_PGNO(pagep) = PGNO_INVALID; 553 LSN(pagep) = *lsnp; 554 } else if (cmp_n == 0 && DB_UNDO(op)) { 555 /* Need to undo update described. */ 556 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 557 P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID, 558 argp->next_pgno, 0, P_HASH); 559 LSN(pagep) = argp->pagelsn; 560 } 561 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 562 goto out; 563 pagep = NULL; 564 565donext: /* Now fix up the "next" page. */ 566 REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn); 567 568 /* For REDO just update the LSN. For UNDO copy page back. */ 569 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 570 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn); 571 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn); 572 if (cmp_p == 0 && DB_REDO(op)) { 573 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 574 LSN(pagep) = *lsnp; 575 } else if (cmp_n == 0 && DB_UNDO(op)) { 576 /* Need to undo update described. */ 577 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 578 memcpy(pagep, argp->page.data, argp->page.size); 579 } 580 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 581 goto out; 582 pagep = NULL; 583 584 /* Now fix up the next's next page. */ 585do_nn: if (argp->nnext_pgno == PGNO_INVALID) 586 goto done; 587 588 REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done); 589 590 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 591 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn); 592 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn); 593 594 if (cmp_p == 0 && DB_REDO(op)) { 595 /* Need to redo update described. */ 596 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 597 PREV_PGNO(pagep) = argp->pgno; 598 LSN(pagep) = *lsnp; 599 } else if (cmp_n == 0 && DB_UNDO(op)) { 600 /* Need to undo update described. */ 601 REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); 602 PREV_PGNO(pagep) = argp->next_pgno; 603 LSN(pagep) = argp->nnextlsn; 604 } 605 if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) 606 goto out; 607 pagep = NULL; 608 609done: *lsnp = argp->prev_lsn; 610 ret = 0; 611 612out: if (pagep != NULL) 613 (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); 614 REC_CLOSE; 615} 616 617/* 618 * __ham_metagroup_recover -- 619 * Recovery function for metagroup. 620 * 621 * PUBLIC: int __ham_metagroup_recover 622 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 623 */ 624int 625__ham_metagroup_recover(env, dbtp, lsnp, op, info) 626 ENV *env; 627 DBT *dbtp; 628 DB_LSN *lsnp; 629 db_recops op; 630 void *info; 631{ 632 __ham_metagroup_args *argp; 633 DB_THREAD_INFO *ip; 634 HASH_CURSOR *hcp; 635 DB *file_dbp; 636 DBMETA *mmeta; 637 DBC *dbc; 638 DB_MPOOLFILE *mpf; 639 PAGE *pagep; 640 db_pgno_t pgno; 641 int cmp_n, cmp_p, did_alloc, groupgrow, ret; 642 643 ip = ((DB_TXNHEAD *)info)->thread_info; 644 mmeta = NULL; 645 did_alloc = 0; 646 REC_PRINT(__ham_metagroup_print); 647 REC_INTRO(__ham_metagroup_read, ip, 1); 648 649 /* 650 * This logs the virtual create of pages pgno to pgno + bucket. 651 * The log record contains: 652 * bucket: old maximum bucket 653 * pgno: page number of the new bucket. 654 * We round up on log calculations, so we can figure out if we are 655 * about to double the hash table if argp->bucket+1 is a power of 2. 656 * If it is, then we are allocating an entire doubling of pages, 657 * otherwise, we are simply allocated one new page. 658 */ 659 groupgrow = 660 (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1; 661 pgno = argp->pgno; 662 if (argp->newalloc) 663 pgno += argp->bucket; 664 665 pagep = NULL; 666 ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep); 667 668 /* If we are undoing, then we don't want to create the page. */ 669 if (ret != 0 && DB_REDO(op)) 670 ret = __memp_fget(mpf, 671 &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep); 672 else if (ret == DB_PAGE_NOTFOUND) 673 goto do_meta; 674 if (ret != 0) { 675 if (ret != ENOSPC) 676 goto out; 677 pgno = 0; 678 goto do_meta; 679 } 680 681 /* 682 * When we get here then either we did not grow the file 683 * (groupgrow == 0) or we did grow the file and the allocation 684 * of those new pages succeeded. 685 */ 686 did_alloc = groupgrow; 687 688 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 689 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 690 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 691 692 if (cmp_p == 0 && DB_REDO(op)) { 693 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 694 pagep->lsn = *lsnp; 695 } else if (cmp_n == 0 && DB_UNDO(op)) { 696 /* If this record allocated the pages give them back. */ 697 if (argp->newalloc) { 698 if (pagep != NULL && (ret = __memp_fput(mpf, 699 ip, pagep, DB_PRIORITY_VERY_LOW)) != 0) 700 goto out; 701 pagep = NULL; 702 if ((ret = 703 __memp_ftruncate(mpf, ip, argp->pgno, 0)) != 0) 704 goto out; 705 } else { 706 /* 707 * Otherwise just roll the page back to its 708 * previous state. 709 */ 710 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 711 pagep->lsn = argp->pagelsn; 712 } 713 } 714 if (pagep != NULL && 715 (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 716 goto out; 717 718do_meta: 719 /* Now we have to update the meta-data page. */ 720 hcp = (HASH_CURSOR *)dbc->internal; 721 if ((ret = __ham_get_meta(dbc)) != 0) 722 goto out; 723 cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn); 724 cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn); 725 CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn); 726 if (cmp_p == 0 && DB_REDO(op)) { 727 /* Redo the actual updating of bucket counts. */ 728 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 729 ++hcp->hdr->max_bucket; 730 if (groupgrow) { 731 hcp->hdr->low_mask = hcp->hdr->high_mask; 732 hcp->hdr->high_mask = 733 (argp->bucket + 1) | hcp->hdr->low_mask; 734 } 735 hcp->hdr->dbmeta.lsn = *lsnp; 736 } else if (cmp_n == 0 && DB_UNDO(op)) { 737 /* Undo the actual updating of bucket counts. */ 738 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 739 hcp->hdr->max_bucket = argp->bucket; 740 if (groupgrow) { 741 hcp->hdr->high_mask = argp->bucket; 742 hcp->hdr->low_mask = hcp->hdr->high_mask >> 1; 743 } 744 hcp->hdr->dbmeta.lsn = argp->metalsn; 745 } 746 747 /* 748 * Now we need to fix up the spares array. Each entry in the 749 * spares array indicates the beginning page number for the 750 * indicated doubling. We need to fill this in whenever the 751 * spares array is invalid, if we never reclaim pages then 752 * we have to allocate the pages to the spares array in both 753 * the redo and undo cases. 754 */ 755 if (did_alloc && !DB_UNDO(op) && 756 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) { 757 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 758 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] = 759 (argp->pgno - argp->bucket) - 1; 760 } 761 if (cmp_n == 0 && groupgrow && DB_UNDO(op)) { 762 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 763 hcp->hdr->spares[ 764 __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID; 765 } 766 767 /* 768 * Finally, we need to potentially fix up the last_pgno field 769 * in the master meta-data page (which may or may not be the 770 * same as the hash header page). 771 */ 772 if (argp->mmpgno != argp->mpgno) { 773 if ((ret = __memp_fget(mpf, 774 &argp->mmpgno, ip, NULL, DB_MPOOL_EDIT, &mmeta)) != 0) { 775 if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND) 776 ret = 0; 777 goto out; 778 } 779 cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn); 780 cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn); 781 if (cmp_p == 0 && DB_REDO(op)) { 782 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 783 mmeta->lsn = *lsnp; 784 } else if (cmp_n == 0 && DB_UNDO(op)) { 785 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 786 mmeta->lsn = argp->mmetalsn; 787 } 788 } else { 789 mmeta = (DBMETA *)hcp->hdr; 790 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 791 } 792 793 if (cmp_n == 0 && DB_UNDO(op)) 794 mmeta->last_pgno = argp->last_pgno; 795 else if (DB_REDO(op) && mmeta->last_pgno < pgno) 796 mmeta->last_pgno = pgno; 797 798 if (argp->mmpgno != argp->mpgno && 799 (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0) 800 goto out; 801 mmeta = NULL; 802 803done: *lsnp = argp->prev_lsn; 804 ret = 0; 805 806out: if (mmeta != NULL) 807 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 808 if (dbc != NULL) 809 (void)__ham_release_meta(dbc); 810 811 REC_CLOSE; 812} 813 814/* 815 * __ham_groupalloc_recover -- 816 * Recover the batch creation of a set of pages for a new database. 817 * 818 * PUBLIC: int __ham_groupalloc_recover 819 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 820 */ 821int 822__ham_groupalloc_recover(env, dbtp, lsnp, op, info) 823 ENV *env; 824 DBT *dbtp; 825 DB_LSN *lsnp; 826 db_recops op; 827 void *info; 828{ 829 __ham_groupalloc_args *argp; 830 DB_THREAD_INFO *ip; 831 DBMETA *mmeta; 832 DB_MPOOLFILE *mpf; 833 DB *file_dbp; 834 DBC *dbc; 835 PAGE *pagep; 836 db_pgno_t pgno; 837 int cmp_n, cmp_p, ret; 838 839 ip = ((DB_TXNHEAD *)info)->thread_info; 840 mmeta = NULL; 841 REC_PRINT(__ham_groupalloc_print); 842 REC_INTRO(__ham_groupalloc_read, ip, 1); 843 844 pgno = PGNO_BASE_MD; 845 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) { 846 if (DB_REDO(op)) { 847 ret = __db_pgerr(file_dbp, pgno, ret); 848 goto out; 849 } else 850 goto done; 851 } 852 853 cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta)); 854 cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn); 855 CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn); 856 857 /* 858 * Basically, we used mpool to allocate a chunk of pages. 859 * We need to either add those to a free list (in the undo 860 * case) or initialize them (in the redo case). 861 * 862 * If we are redoing and this is a hash subdatabase, it's possible 863 * that the pages were never allocated, so we'd better check for 864 * that and handle it here. 865 */ 866 pgno = argp->start_pgno + argp->num - 1; 867 if (DB_REDO(op)) { 868 if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0) 869 goto out; 870 if (cmp_p == 0) { 871 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 872 LSN(mmeta) = *lsnp; 873 } 874 } else if (DB_UNDO(op)) { 875 /* 876 * Fetch the last page and determine if it is in 877 * the post allocation state. 878 */ 879 pagep = NULL; 880 if ((ret = __memp_fget(mpf, &pgno, 881 ip, NULL, DB_MPOOL_EDIT, &pagep)) == 0) { 882 if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) { 883 if ((ret = __memp_fput(mpf, ip, 884 pagep, DB_PRIORITY_VERY_LOW)) != 0) 885 goto out; 886 pagep = NULL; 887 } 888 } else if (ret != DB_PAGE_NOTFOUND) 889 goto out; 890 /* 891 * If the last page was allocated then truncate back 892 * to the first page. 893 */ 894 if (pagep != NULL) { 895 if ((ret = __memp_fput(mpf, ip, 896 pagep, DB_PRIORITY_VERY_LOW)) != 0) 897 goto out; 898 if ((ret = __memp_ftruncate(mpf, 899 ip, argp->start_pgno, 0)) != 0) 900 goto out; 901 } 902 903 /* 904 * If we are rolling back the metapage, then make 905 * sure it reflects the the correct last_pgno. 906 */ 907 if (cmp_n == 0) { 908 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 909 mmeta->last_pgno = argp->last_pgno; 910 } 911 pgno = 0; 912 if (cmp_n == 0) { 913 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 914 LSN(mmeta) = argp->meta_lsn; 915 } 916 } 917 918 /* 919 * Set the last page number to the current value. 920 */ 921 if (pgno > mmeta->last_pgno) { 922 REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta); 923 mmeta->last_pgno = pgno; 924 } 925 926done: if (ret == 0) 927 *lsnp = argp->prev_lsn; 928 ret = 0; 929 930out: if (mmeta != NULL) 931 (void)__memp_fput(mpf, ip, mmeta, file_dbp->priority); 932 933 REC_CLOSE; 934} 935 936/* 937 * __ham_alloc_pages -- 938 * 939 * Called during redo of a file create. We create new pages in the file 940 * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a 941 * __crdel_metasub message. If we manage to crash without the newly written 942 * pages getting to disk (I'm not sure this can happen anywhere except our 943 * test suite?!), then we need to go through a recreate the final pages. 944 * Hash normally has holes in its files and handles them appropriately. 945 */ 946static int 947__ham_alloc_pages(dbc, argp, lsnp) 948 DBC *dbc; 949 __ham_groupalloc_args *argp; 950 DB_LSN *lsnp; 951{ 952 DB *file_dbp; 953 DB_MPOOLFILE *mpf; 954 DB_THREAD_INFO *ip; 955 PAGE *pagep; 956 db_pgno_t pgno; 957 int ret; 958 959 file_dbp = dbc->dbp; 960 mpf = file_dbp->mpf; 961 ip = dbc->thread_info; 962 963 /* Read the last page of the allocation. */ 964 pgno = argp->start_pgno + argp->num - 1; 965 966 /* If the page exists, and it has been initialized, then we're done. */ 967 if ((ret = 968 __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) { 969 if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn)) 970 goto reinit_page; 971 return (__memp_fput(mpf, ip, pagep, dbc->priority)); 972 } 973 974 /* Had to create the page. */ 975 if ((ret = __memp_fget(mpf, &pgno, 976 ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) 977 return (__db_pgerr(dbc->dbp, pgno, ret)); 978 979reinit_page: 980 /* Initialize the newly allocated page. */ 981 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 982 P_INIT(pagep, dbc->dbp->pgsize, 983 pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 984 pagep->lsn = *lsnp; 985 986out: return (__memp_fput(mpf, ip, pagep, dbc->priority)); 987} 988 989/* 990 * __ham_curadj_recover -- 991 * Undo cursor adjustments if a subtransaction fails. 992 * 993 * PUBLIC: int __ham_curadj_recover 994 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 995 */ 996int 997__ham_curadj_recover(env, dbtp, lsnp, op, info) 998 ENV *env; 999 DBT *dbtp; 1000 DB_LSN *lsnp; 1001 db_recops op; 1002 void *info; 1003{ 1004 __ham_curadj_args *argp; 1005 db_ham_curadj mode, hamc_mode; 1006 DB_THREAD_INFO *ip; 1007 DB_MPOOLFILE *mpf; 1008 DB *file_dbp; 1009 DBC *dbc; 1010 HASH_CURSOR *hcp; 1011 int ret; 1012 1013 ip = ((DB_TXNHEAD *)info)->thread_info; 1014 REC_PRINT(__ham_curadj_print); 1015 REC_INTRO(__ham_curadj_read, ip, 1); 1016 1017 if (op != DB_TXN_ABORT) 1018 goto done; 1019 1020 mode = (db_ham_curadj)argp->add; 1021 1022 /* 1023 * Reverse the logged operation, so that the consequences are reversed 1024 * by the __hamc_update code. 1025 */ 1026 switch (mode) { 1027 case DB_HAM_CURADJ_DEL: 1028 hamc_mode = DB_HAM_CURADJ_ADD; 1029 break; 1030 case DB_HAM_CURADJ_ADD: 1031 hamc_mode = DB_HAM_CURADJ_DEL; 1032 break; 1033 case DB_HAM_CURADJ_ADDMOD: 1034 hamc_mode = DB_HAM_CURADJ_DELMOD; 1035 break; 1036 case DB_HAM_CURADJ_DELMOD: 1037 hamc_mode = DB_HAM_CURADJ_ADDMOD; 1038 break; 1039 default: 1040 __db_errx(env, 1041 "Invalid flag in __ham_curadj_recover"); 1042 ret = EINVAL; 1043 goto out; 1044 } 1045 1046 /* 1047 * Undo the adjustment by reinitializing the the cursor to look like 1048 * the one that was used to do the adjustment, then we invert the 1049 * add so that undo the adjustment. 1050 */ 1051 hcp = (HASH_CURSOR *)dbc->internal; 1052 hcp->pgno = argp->pgno; 1053 hcp->indx = argp->indx; 1054 hcp->dup_off = argp->dup_off; 1055 hcp->order = argp->order; 1056 if (mode == DB_HAM_CURADJ_DEL) 1057 F_SET(hcp, H_DELETED); 1058 (void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup); 1059 1060done: *lsnp = argp->prev_lsn; 1061out: REC_CLOSE; 1062} 1063 1064/* 1065 * __ham_chgpg_recover -- 1066 * Undo cursor adjustments if a subtransaction fails. 1067 * 1068 * PUBLIC: int __ham_chgpg_recover 1069 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1070 */ 1071int 1072__ham_chgpg_recover(env, dbtp, lsnp, op, info) 1073 ENV *env; 1074 DBT *dbtp; 1075 DB_LSN *lsnp; 1076 db_recops op; 1077 void *info; 1078{ 1079 __ham_chgpg_args *argp; 1080 DB_THREAD_INFO *ip; 1081 BTREE_CURSOR *opdcp; 1082 DB_MPOOLFILE *mpf; 1083 DB *file_dbp, *ldbp; 1084 DBC *dbc; 1085 DBC *cp; 1086 HASH_CURSOR *lcp; 1087 u_int32_t order, indx; 1088 int ret; 1089 1090 ip = ((DB_TXNHEAD *)info)->thread_info; 1091 REC_PRINT(__ham_chgpg_print); 1092 REC_INTRO(__ham_chgpg_read, ip, 0); 1093 1094 if (op != DB_TXN_ABORT) 1095 goto done; 1096 1097 /* Overloaded fields for DB_HAM_DEL*PG */ 1098 indx = argp->old_indx; 1099 order = argp->new_indx; 1100 1101 MUTEX_LOCK(env, env->mtx_dblist); 1102 FIND_FIRST_DB_MATCH(env, file_dbp, ldbp); 1103 for (; 1104 ldbp != NULL && ldbp->adj_fileid == file_dbp->adj_fileid; 1105 ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { 1106 MUTEX_LOCK(env, file_dbp->mutex); 1107 TAILQ_FOREACH(cp, &ldbp->active_queue, links) { 1108 lcp = (HASH_CURSOR *)cp->internal; 1109 1110 switch (argp->mode) { 1111 case DB_HAM_DELFIRSTPG: 1112 if (lcp->pgno != argp->new_pgno || 1113 MVCC_SKIP_CURADJ(cp, lcp->pgno)) 1114 break; 1115 if (lcp->indx != indx || 1116 !F_ISSET(lcp, H_DELETED) || 1117 lcp->order >= order) { 1118 lcp->pgno = argp->old_pgno; 1119 if (lcp->indx == indx) 1120 lcp->order -= order; 1121 } 1122 break; 1123 case DB_HAM_DELMIDPG: 1124 case DB_HAM_DELLASTPG: 1125 if (lcp->pgno == argp->new_pgno && 1126 lcp->indx == indx && 1127 F_ISSET(lcp, H_DELETED) && 1128 lcp->order >= order && 1129 !MVCC_SKIP_CURADJ(cp, lcp->pgno)) { 1130 lcp->pgno = argp->old_pgno; 1131 lcp->order -= order; 1132 lcp->indx = 0; 1133 } 1134 break; 1135 case DB_HAM_CHGPG: 1136 /* 1137 * If we're doing a CHGPG, we're undoing 1138 * the move of a non-deleted item to a 1139 * new page. Any cursors with the deleted 1140 * flag set do not belong to this item; 1141 * don't touch them. 1142 */ 1143 if (F_ISSET(lcp, H_DELETED)) 1144 break; 1145 /* FALLTHROUGH */ 1146 case DB_HAM_SPLIT: 1147 if (lcp->pgno == argp->new_pgno && 1148 lcp->indx == argp->new_indx && 1149 !MVCC_SKIP_CURADJ(cp, lcp->pgno)) { 1150 lcp->indx = argp->old_indx; 1151 lcp->pgno = argp->old_pgno; 1152 } 1153 break; 1154 case DB_HAM_DUP: 1155 if (lcp->opd == NULL) 1156 break; 1157 opdcp = (BTREE_CURSOR *)lcp->opd->internal; 1158 if (opdcp->pgno != argp->new_pgno || 1159 opdcp->indx != argp->new_indx || 1160 MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno)) 1161 break; 1162 1163 if (F_ISSET(opdcp, C_DELETED)) 1164 F_SET(lcp, H_DELETED); 1165 /* 1166 * We can't close a cursor while we have the 1167 * dbp mutex locked, since c_close reacquires 1168 * it. It should be safe to drop the mutex 1169 * here, though, since newly opened cursors 1170 * are put only at the end of the tailq and 1171 * the cursor we're adjusting can't be closed 1172 * under us. 1173 */ 1174 MUTEX_UNLOCK(env, file_dbp->mutex); 1175 if ((ret = __dbc_close(lcp->opd)) != 0) 1176 goto out; 1177 MUTEX_LOCK(env, file_dbp->mutex); 1178 lcp->opd = NULL; 1179 break; 1180 } 1181 } 1182 MUTEX_UNLOCK(env, file_dbp->mutex); 1183 } 1184 MUTEX_UNLOCK(env, env->mtx_dblist); 1185 1186done: *lsnp = argp->prev_lsn; 1187out: REC_CLOSE; 1188} 1189 1190/* 1191 * __ham_metagroup_recover -- 1192 * Recovery function for metagroup. 1193 * 1194 * PUBLIC: int __ham_metagroup_42_recover 1195 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1196 */ 1197int 1198__ham_metagroup_42_recover(env, dbtp, lsnp, op, info) 1199 ENV *env; 1200 DBT *dbtp; 1201 DB_LSN *lsnp; 1202 db_recops op; 1203 void *info; 1204{ 1205 __ham_metagroup_42_args *argp; 1206 DB_THREAD_INFO *ip; 1207 HASH_CURSOR *hcp; 1208 DB *file_dbp; 1209 DBMETA *mmeta; 1210 DBC *dbc; 1211 DB_MPOOLFILE *mpf; 1212 PAGE *pagep; 1213 db_pgno_t pgno; 1214 u_int32_t flags; 1215 int cmp_n, cmp_p, did_alloc, groupgrow, ret; 1216 1217 ip = ((DB_TXNHEAD *)info)->thread_info; 1218 mmeta = NULL; 1219 did_alloc = 0; 1220 REC_PRINT(__ham_metagroup_42_print); 1221 REC_INTRO(__ham_metagroup_42_read, ip, 1); 1222 1223 /* 1224 * This logs the virtual create of pages pgno to pgno + bucket 1225 * If HAVE_FTRUNCATE is not supported the mpool page-allocation is not 1226 * transaction protected, we can never undo it. Even in an abort, 1227 * we have to allocate these pages to the hash table if they 1228 * were actually created. In particular, during disaster 1229 * recovery the metapage may be before this point if we 1230 * are rolling backward. If the file has not been extended 1231 * then the metapage could not have been updated. 1232 * The log record contains: 1233 * bucket: old maximum bucket 1234 * pgno: page number of the new bucket. 1235 * We round up on log calculations, so we can figure out if we are 1236 * about to double the hash table if argp->bucket+1 is a power of 2. 1237 * If it is, then we are allocating an entire doubling of pages, 1238 * otherwise, we are simply allocated one new page. 1239 */ 1240 groupgrow = 1241 (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1; 1242 pgno = argp->pgno; 1243 if (argp->newalloc) 1244 pgno += argp->bucket; 1245 1246 flags = 0; 1247 pagep = NULL; 1248 LF_SET(DB_MPOOL_CREATE); 1249 ret = __memp_fget(mpf, &pgno, ip, NULL, flags, &pagep); 1250 1251 if (ret != 0) { 1252 if (ret != ENOSPC) 1253 goto out; 1254 pgno = 0; 1255 goto do_meta; 1256 } 1257 1258 /* 1259 * When we get here then either we did not grow the file 1260 * (groupgrow == 0) or we did grow the file and the allocation 1261 * of those new pages succeeded. 1262 */ 1263 did_alloc = groupgrow; 1264 1265 cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); 1266 cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn); 1267 CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn); 1268 1269 if (cmp_p == 0 && DB_REDO(op)) { 1270 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 1271 pagep->lsn = *lsnp; 1272 } else if (cmp_n == 0 && DB_UNDO(op)) { 1273 /* 1274 * Otherwise just roll the page back to its 1275 * previous state. 1276 */ 1277 REC_DIRTY(mpf, ip, dbc->priority, &pagep); 1278 pagep->lsn = argp->pagelsn; 1279 } 1280 if (pagep != NULL && 1281 (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 1282 goto out; 1283 1284do_meta: 1285 /* Now we have to update the meta-data page. */ 1286 hcp = (HASH_CURSOR *)dbc->internal; 1287 if ((ret = __ham_get_meta(dbc)) != 0) 1288 goto out; 1289 cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn); 1290 cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn); 1291 CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn); 1292 if (cmp_p == 0 && DB_REDO(op)) { 1293 /* Redo the actual updating of bucket counts. */ 1294 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1295 ++hcp->hdr->max_bucket; 1296 if (groupgrow) { 1297 hcp->hdr->low_mask = hcp->hdr->high_mask; 1298 hcp->hdr->high_mask = 1299 (argp->bucket + 1) | hcp->hdr->low_mask; 1300 } 1301 hcp->hdr->dbmeta.lsn = *lsnp; 1302 } else if (cmp_n == 0 && DB_UNDO(op)) { 1303 /* Undo the actual updating of bucket counts. */ 1304 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1305 hcp->hdr->max_bucket = argp->bucket; 1306 if (groupgrow) { 1307 hcp->hdr->high_mask = argp->bucket; 1308 hcp->hdr->low_mask = hcp->hdr->high_mask >> 1; 1309 } 1310 hcp->hdr->dbmeta.lsn = argp->metalsn; 1311 } 1312 1313 /* 1314 * Now we need to fix up the spares array. Each entry in the 1315 * spares array indicates the beginning page number for the 1316 * indicated doubling. We need to fill this in whenever the 1317 * spares array is invalid, if we never reclaim pages then 1318 * we have to allocate the pages to the spares array in both 1319 * the redo and undo cases. 1320 */ 1321 if (did_alloc && 1322 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) { 1323 REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr); 1324 hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] = 1325 (argp->pgno - argp->bucket) - 1; 1326 } 1327 1328 /* 1329 * Finally, we need to potentially fix up the last_pgno field 1330 * in the master meta-data page (which may or may not be the 1331 * same as the hash header page). 1332 */ 1333 if (argp->mmpgno != argp->mpgno) { 1334 if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL, 1335 DB_MPOOL_EDIT, &mmeta)) != 0) { 1336 if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND) 1337 ret = 0; 1338 goto out; 1339 } 1340 cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn); 1341 cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn); 1342 if (cmp_p == 0 && DB_REDO(op)) { 1343 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1344 mmeta->lsn = *lsnp; 1345 } else if (cmp_n == 0 && DB_UNDO(op)) { 1346 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1347 mmeta->lsn = argp->mmetalsn; 1348 } 1349 } else { 1350 mmeta = (DBMETA *)hcp->hdr; 1351 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1352 } 1353 1354 if (mmeta->last_pgno < pgno) 1355 mmeta->last_pgno = pgno; 1356 1357 if (argp->mmpgno != argp->mpgno && 1358 (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0) 1359 goto out; 1360 mmeta = NULL; 1361 1362done: *lsnp = argp->prev_lsn; 1363 ret = 0; 1364 1365out: if (mmeta != NULL) 1366 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 1367 if (dbc != NULL) 1368 (void)__ham_release_meta(dbc); 1369 1370 REC_CLOSE; 1371} 1372 1373/* 1374 * __ham_groupalloc_42_recover -- 1375 * Recover the batch creation of a set of pages for a new database. 1376 * 1377 * PUBLIC: int __ham_groupalloc_42_recover 1378 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 1379 */ 1380int 1381__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info) 1382 ENV *env; 1383 DBT *dbtp; 1384 DB_LSN *lsnp; 1385 db_recops op; 1386 void *info; 1387{ 1388 __ham_groupalloc_42_args *argp; 1389 DB_THREAD_INFO *ip; 1390 DBMETA *mmeta; 1391 DB_MPOOLFILE *mpf; 1392 DB *file_dbp; 1393 DBC *dbc; 1394 db_pgno_t pgno; 1395 int cmp_p, ret; 1396 1397 ip = ((DB_TXNHEAD *)info)->thread_info; 1398 mmeta = NULL; 1399 REC_PRINT(__ham_groupalloc_42_print); 1400 REC_INTRO(__ham_groupalloc_42_read, ip, 1); 1401 1402 pgno = PGNO_BASE_MD; 1403 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) { 1404 if (DB_REDO(op)) { 1405 ret = __db_pgerr(file_dbp, pgno, ret); 1406 goto out; 1407 } else 1408 goto done; 1409 } 1410 1411 cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn); 1412 CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn); 1413 1414 /* 1415 * Basically, we used mpool to allocate a chunk of pages. 1416 * We need to either add those to a free list (in the undo 1417 * case) or initialize them (in the redo case). 1418 * 1419 * If we are redoing and this is a hash subdatabase, it's possible 1420 * that the pages were never allocated, so we'd better check for 1421 * that and handle it here. 1422 */ 1423 pgno = argp->start_pgno + argp->num - 1; 1424 if (DB_REDO(op)) { 1425 if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0) 1426 goto out; 1427 if (cmp_p == 0) { 1428 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1429 LSN(mmeta) = *lsnp; 1430 } 1431 } else if (DB_UNDO(op)) { 1432 /* 1433 * We cannot roll back 4.2 style allocations. 1434 */ 1435 __db_errx(env, 1436"Cannot replicate prepared transactions from master running release 4.2."); 1437 ret = __env_panic(env, EINVAL); 1438 goto out; 1439 } 1440 1441 /* 1442 * In both REDO and UNDO, we have grown the file and need to make 1443 * sure that last_pgno is correct. If we HAVE_FTRUNCATE pgno 1444 * will only be valid on REDO. 1445 */ 1446 if (pgno > mmeta->last_pgno) { 1447 REC_DIRTY(mpf, ip, dbc->priority, &mmeta); 1448 mmeta->last_pgno = pgno; 1449 } 1450 1451done: if (ret == 0) 1452 *lsnp = argp->prev_lsn; 1453 ret = 0; 1454 1455out: if (mmeta != NULL) 1456 (void)__memp_fput(mpf, ip, mmeta, dbc->priority); 1457 1458 REC_CLOSE; 1459} 1460 1461/* 1462 * __ham_alloc_pages_42 -- 1463 * 1464 * Called during redo of a file create. We create new pages in the file 1465 * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a 1466 * __crdel_metasub message. If we manage to crash without the newly written 1467 * pages getting to disk (I'm not sure this can happen anywhere except our 1468 * test suite?!), then we need to go through a recreate the final pages. 1469 * Hash normally has holes in its files and handles them appropriately. 1470 */ 1471static int 1472__ham_alloc_pages_42(dbc, argp, lsnp) 1473 DBC *dbc; 1474 __ham_groupalloc_42_args *argp; 1475 DB_LSN *lsnp; 1476{ 1477 DB_MPOOLFILE *mpf; 1478 DB_THREAD_INFO *ip; 1479 PAGE *pagep; 1480 db_pgno_t pgno; 1481 int ret; 1482 1483 mpf = dbc->dbp->mpf; 1484 ip = dbc->thread_info; 1485 1486 /* Read the last page of the allocation. */ 1487 pgno = argp->start_pgno + argp->num - 1; 1488 1489 /* If the page exists, and it has been initialized, then we're done. */ 1490 if ((ret = __memp_fget(mpf, 1491 &pgno, ip, NULL, 0, &pagep)) == 0) { 1492 if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn)) 1493 goto reinit_page; 1494 if ((ret = __memp_fput(mpf, 1495 ip, pagep, dbc->priority)) != 0) 1496 return (ret); 1497 return (0); 1498 } 1499 1500 /* Had to create the page. */ 1501 if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 1502 DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0) 1503 return (__db_pgerr(dbc->dbp, pgno, ret)); 1504 1505reinit_page: 1506 /* Initialize the newly allocated page. */ 1507 P_INIT(pagep, 1508 dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); 1509 pagep->lsn = *lsnp; 1510 1511 if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) 1512 return (ret); 1513 1514 return (0); 1515} 1516