1# See the file LICENSE for redistribution information. 2# 3# Copyright (c) 2001-2009 Oracle. All rights reserved. 4# 5# $Id$ 6# 7# Replication testing utilities 8 9# Environment handle for the env containing the replication "communications 10# structure" (really a CDB environment). 11 12# The test environment consists of a queue and a # directory (environment) 13# per replication site. The queue is used to hold messages destined for a 14# particular site and the directory will contain the environment for the 15# site. So the environment looks like: 16# $testdir 17# ___________|______________________________ 18# / | \ \ 19# MSGQUEUEDIR MASTERDIR CLIENTDIR.0 ... CLIENTDIR.N-1 20# | | ... | 21# 1 2 .. N+1 22# 23# The master is site 1 in the MSGQUEUEDIR and clients 1-N map to message 24# queues 2 - N+1. 25# 26# The globals repenv(1-N) contain the environment handles for the sites 27# with a given id (i.e., repenv(1) is the master's environment. 28 29 30# queuedbs is an array of DB handles, one per machine ID/machine ID pair, 31# for the databases that contain messages from one machine to another. 32# We omit the cases where the "from" and "to" machines are the same. 33# Since tcl does not have real two-dimensional arrays, we use this 34# naming convention: queuedbs(1.2) has the handle for the database 35# containing messages to machid 1 from machid 2. 36# 37global queuedbs 38global machids 39global perm_response_list 40set perm_response_list {} 41global perm_sent_list 42set perm_sent_list {} 43global elect_timeout 44unset -nocomplain elect_timeout 45set elect_timeout(default) 5000000 46global electable_pri 47set electable_pri 5 48set drop 0 49global anywhere 50set anywhere 0 51 52global rep_verbose 53set rep_verbose 0 54global verbose_type 55set verbose_type "rep" 56 57# To run a replication test with verbose messages, type 58# 'run_verbose' and then the usual test command string enclosed 59# in double quotes or curly braces. For example: 60# 61# run_verbose "rep001 btree" 62# 63# run_verbose {run_repmethod btree test001} 64# 65# To run a replication test with one of the subsets of verbose 66# messages, use the same syntax with 'run_verbose_elect', 67# 'run_verbose_lease', etc. 68 69proc run_verbose { commandstring } { 70 global verbose_type 71 set verbose_type "rep" 72 run_verb $commandstring 73} 74 75proc run_verbose_elect { commandstring } { 76 global verbose_type 77 set verbose_type "rep_elect" 78 run_verb $commandstring 79} 80 81proc run_verbose_lease { commandstring } { 82 global verbose_type 83 set verbose_type "rep_lease" 84 run_verb $commandstring 85} 86 87proc run_verbose_misc { commandstring } { 88 global verbose_type 89 set verbose_type "rep_misc" 90 run_verb $commandstring 91} 92 93proc run_verbose_msgs { commandstring } { 94 global verbose_type 95 set verbose_type "rep_msgs" 96 run_verb $commandstring 97} 98 99proc run_verbose_sync { commandstring } { 100 global verbose_type 101 set verbose_type "rep_sync" 102 run_verb $commandstring 103} 104 105proc run_verbose_test { commandstring } { 106 global verbose_type 107 set verbose_type "rep_test" 108 run_verb $commandstring 109} 110 111proc run_verbose_repmgr_misc { commandstring } { 112 global verbose_type 113 set verbose_type "repmgr_misc" 114 run_verb $commandstring 115} 116 117proc run_verb { commandstring } { 118 global rep_verbose 119 global verbose_type 120 121 set rep_verbose 1 122 if { [catch { 123 eval $commandstring 124 flush stdout 125 flush stderr 126 } res] != 0 } { 127 global errorInfo 128 129 set rep_verbose 0 130 set fnl [string first "\n" $errorInfo] 131 set theError [string range $errorInfo 0 [expr $fnl - 1]] 132 if {[string first FAIL $errorInfo] == -1} { 133 error "FAIL:[timestamp]\ 134 run_verbose: $commandstring: $theError" 135 } else { 136 error $theError; 137 } 138 } 139 set rep_verbose 0 140} 141 142# Databases are on-disk by default for replication testing. 143# Some replication tests have been converted to run with databases 144# in memory instead. 145 146global databases_in_memory 147set databases_in_memory 0 148 149proc run_inmem_db { test method } { 150 run_inmem $test $method 1 0 0 0 151} 152 153# Replication files are on-disk by default for replication testing. 154# Some replication tests have been converted to run with rep files 155# in memory instead. 156 157global repfiles_in_memory 158set repfiles_in_memory 0 159 160proc run_inmem_rep { test method } { 161 run_inmem $test $method 0 0 1 0 162} 163 164# Region files are on-disk by default for replication testing. 165# Replication tests can force the region files in-memory by setting 166# the -private flag when opening an env. 167 168global env_private 169set env_private 0 170 171proc run_env_private { test method } { 172 run_inmem $test $method 0 0 0 1 173} 174 175# Logs are on-disk by default for replication testing. 176# Mixed-mode log testing provides a mixture of on-disk and 177# in-memory logging, or even all in-memory. When testing on a 178# 1-master/1-client test, we try all four options. On a test 179# with more clients, we still try four options, randomly 180# selecting whether the later clients are on-disk or in-memory. 181# 182 183global mixed_mode_logging 184set mixed_mode_logging 0 185 186proc create_logsets { nsites } { 187 global mixed_mode_logging 188 global logsets 189 global rand_init 190 191 error_check_good set_random_seed [berkdb srand $rand_init] 0 192 if { $mixed_mode_logging == 0 || $mixed_mode_logging == 2 } { 193 if { $mixed_mode_logging == 0 } { 194 set logmode "on-disk" 195 } else { 196 set logmode "in-memory" 197 } 198 set loglist {} 199 for { set i 0 } { $i < $nsites } { incr i } { 200 lappend loglist $logmode 201 } 202 set logsets [list $loglist] 203 } 204 if { $mixed_mode_logging == 1 } { 205 set set1 {on-disk on-disk} 206 set set2 {on-disk in-memory} 207 set set3 {in-memory on-disk} 208 set set4 {in-memory in-memory} 209 210 # Start with nsites at 2 since we already set up 211 # the master and first client. 212 for { set i 2 } { $i < $nsites } { incr i } { 213 foreach set { set1 set2 set3 set4 } { 214 if { [berkdb random_int 0 1] == 0 } { 215 lappend $set "on-disk" 216 } else { 217 lappend $set "in-memory" 218 } 219 } 220 } 221 set logsets [list $set1 $set2 $set3 $set4] 222 } 223 return $logsets 224} 225 226proc run_inmem_log { test method } { 227 run_inmem $test $method 0 1 0 0 228} 229 230# Run_mixedmode_log is a little different from the other run_inmem procs: 231# it provides a mixture of in-memory and on-disk logging on the different 232# hosts in a replication group. 233proc run_mixedmode_log { test method {display 0} {run 1} \ 234 {outfile stdout} {largs ""} } { 235 global mixed_mode_logging 236 set mixed_mode_logging 1 237 238 set prefix [string range $test 0 2] 239 if { $prefix != "rep" } { 240 puts "Skipping mixed-mode log testing for non-rep test." 241 set mixed_mode_logging 0 242 return 243 } 244 245 eval run_method $method $test $display $run $outfile $largs 246 247 # Reset to default values after run. 248 set mixed_mode_logging 0 249} 250 251# The procs run_inmem_db, run_inmem_log, run_inmem_rep, and run_env_private 252# put databases, logs, rep files, or region files in-memory. (Setting up 253# an env with the -private flag puts region files in memory.) 254# The proc run_inmem allows you to put any or all of these in-memory 255# at the same time. 256 257proc run_inmem { test method\ 258 {dbinmem 1} {logsinmem 1} {repinmem 1} {envprivate 1} } { 259 260 set prefix [string range $test 0 2] 261 if { $prefix != "rep" } { 262 puts "Skipping in-memory testing for non-rep test." 263 return 264 } 265 global databases_in_memory 266 global mixed_mode_logging 267 global repfiles_in_memory 268 global env_private 269 global test_names 270 271 if { $dbinmem } { 272 if { [is_substr $test_names(rep_inmem) $test] == 0 } { 273 puts "Test $test does not support in-memory databases." 274 puts "Putting databases on-disk." 275 set databases_in_memory 0 276 } else { 277 set databases_in_memory 1 278 } 279 } 280 if { $logsinmem } { 281 set mixed_mode_logging 2 282 } 283 if { $repinmem } { 284 set repfiles_in_memory 1 285 } 286 if { $envprivate } { 287 set env_private 1 288 } 289 290 if { [catch {eval run_method $method $test} res] } { 291 set databases_in_memory 0 292 set mixed_mode_logging 0 293 set repfiles_in_memory 0 294 set env_private 0 295 puts "FAIL: $res" 296 } 297 298 set databases_in_memory 0 299 set mixed_mode_logging 0 300 set repfiles_in_memory 0 301 set env_private 0 302} 303 304# The proc run_diskless runs run_inmem with its default values. 305# It's useful to have this name to remind us of its testing purpose, 306# which is to mimic a diskless host. 307 308proc run_diskless { test method } { 309 run_inmem $test $method 1 1 1 1 310} 311 312# Open the master and client environments; store these in the global repenv 313# Return the master's environment: "-env masterenv" 314proc repl_envsetup { envargs largs test {nclients 1} {droppct 0} { oob 0 } } { 315 source ./include.tcl 316 global clientdir 317 global drop drop_msg 318 global masterdir 319 global repenv 320 global rep_verbose 321 global verbose_type 322 323 set verbargs "" 324 if { $rep_verbose == 1 } { 325 set verbargs " -verbose {$verbose_type on}" 326 } 327 328 env_cleanup $testdir 329 330 replsetup $testdir/MSGQUEUEDIR 331 332 set masterdir $testdir/MASTERDIR 333 file mkdir $masterdir 334 if { $droppct != 0 } { 335 set drop 1 336 set drop_msg [expr 100 / $droppct] 337 } else { 338 set drop 0 339 } 340 341 for { set i 0 } { $i < $nclients } { incr i } { 342 set clientdir($i) $testdir/CLIENTDIR.$i 343 file mkdir $clientdir($i) 344 } 345 346 # Open a master. 347 repladd 1 348 # 349 # Set log smaller than default to force changing files, 350 # but big enough so that the tests that use binary files 351 # as keys/data can run. Increase the size of the log region -- 352 # sdb004 needs this, now that subdatabase names are stored 353 # in the env region. 354 # 355 set logmax [expr 3 * 1024 * 1024] 356 set lockmax 40000 357 set logregion 2097152 358 359 set ma_cmd "berkdb_env_noerr -create -log_max $logmax $envargs \ 360 -cachesize { 0 4194304 1 } -log_regionmax $logregion \ 361 -lock_max_objects $lockmax -lock_max_locks $lockmax \ 362 -errpfx $masterdir $verbargs \ 363 -home $masterdir -txn nosync -rep_master -rep_transport \ 364 \[list 1 replsend\]" 365 set masterenv [eval $ma_cmd] 366 error_check_good master_env [is_valid_env $masterenv] TRUE 367 set repenv(master) $masterenv 368 369 # Open clients 370 for { set i 0 } { $i < $nclients } { incr i } { 371 set envid [expr $i + 2] 372 repladd $envid 373 set cl_cmd "berkdb_env_noerr -create $envargs -txn nosync \ 374 -cachesize { 0 10000000 0 } -log_regionmax $logregion \ 375 -lock_max_objects $lockmax -lock_max_locks $lockmax \ 376 -errpfx $clientdir($i) $verbargs \ 377 -home $clientdir($i) -rep_client -rep_transport \ 378 \[list $envid replsend\]" 379 set clientenv [eval $cl_cmd] 380 error_check_good client_env [is_valid_env $clientenv] TRUE 381 set repenv($i) $clientenv 382 } 383 set repenv($i) NULL 384 append largs " -env $masterenv " 385 386 # Process startup messages 387 repl_envprocq $test $nclients $oob 388 389 # Clobber replication's 30-second anti-archive timer, which 390 # will have been started by client sync-up internal init, in 391 # case the test we're about to run wants to do any log 392 # archiving, or database renaming and/or removal. 393 $masterenv test force noarchive_timeout 394 395 return $largs 396} 397 398# Process all incoming messages. Iterate until there are no messages left 399# in anyone's queue so that we capture all message exchanges. We verify that 400# the requested number of clients matches the number of client environments 401# we have. The oob parameter indicates if we should process the queue 402# with out-of-order delivery. The replprocess procedure actually does 403# the real work of processing the queue -- this routine simply iterates 404# over the various queues and does the initial setup. 405proc repl_envprocq { test { nclients 1 } { oob 0 }} { 406 global repenv 407 global drop 408 409 set masterenv $repenv(master) 410 for { set i 0 } { 1 } { incr i } { 411 if { $repenv($i) == "NULL"} { 412 break 413 } 414 } 415 error_check_good i_nclients $nclients $i 416 417 berkdb debug_check 418 puts -nonewline "\t$test: Processing master/$i client queues" 419 set rand_skip 0 420 if { $oob } { 421 puts " out-of-order" 422 } else { 423 puts " in order" 424 } 425 set droprestore $drop 426 while { 1 } { 427 set nproced 0 428 429 if { $oob } { 430 set rand_skip [berkdb random_int 2 10] 431 } 432 incr nproced [replprocessqueue $masterenv 1 $rand_skip] 433 for { set i 0 } { $i < $nclients } { incr i } { 434 set envid [expr $i + 2] 435 if { $oob } { 436 set rand_skip [berkdb random_int 2 10] 437 } 438 set n [replprocessqueue $repenv($i) \ 439 $envid $rand_skip] 440 incr nproced $n 441 } 442 443 if { $nproced == 0 } { 444 # Now that we delay requesting records until 445 # we've had a few records go by, we should always 446 # see that the number of requests is lower than the 447 # number of messages that were enqueued. 448 for { set i 0 } { $i < $nclients } { incr i } { 449 set clientenv $repenv($i) 450 set queued [stat_field $clientenv rep_stat \ 451 "Total log records queued"] 452 error_check_bad queued_stats \ 453 $queued -1 454 set requested [stat_field $clientenv rep_stat \ 455 "Log records requested"] 456 error_check_bad requested_stats \ 457 $requested -1 458 459 # 460 # Set to 100 usecs. An average ping 461 # to localhost should be a few 10s usecs. 462 # 463 $clientenv rep_request 100 400 464 } 465 466 # If we were dropping messages, we might need 467 # to flush the log so that we get everything 468 # and end up in the right state. 469 if { $drop != 0 } { 470 set drop 0 471 $masterenv rep_flush 472 berkdb debug_check 473 puts "\t$test: Flushing Master" 474 } else { 475 break 476 } 477 } 478 } 479 480 # Reset the clients back to the default state in case we 481 # have more processing to do. 482 for { set i 0 } { $i < $nclients } { incr i } { 483 set clientenv $repenv($i) 484 $clientenv rep_request 40000 1280000 485 } 486 set drop $droprestore 487} 488 489# Verify that the directories in the master are exactly replicated in 490# each of the client environments. 491proc repl_envver0 { test method { nclients 1 } } { 492 global clientdir 493 global masterdir 494 global repenv 495 496 # Verify the database in the client dir. 497 # First dump the master. 498 set t1 $masterdir/t1 499 set t2 $masterdir/t2 500 set t3 $masterdir/t3 501 set omethod [convert_method $method] 502 503 # 504 # We are interested in the keys of whatever databases are present 505 # in the master environment, so we just call a no-op check function 506 # since we have no idea what the contents of this database really is. 507 # We just need to walk the master and the clients and make sure they 508 # have the same contents. 509 # 510 set cwd [pwd] 511 cd $masterdir 512 set stat [catch {glob test*.db} dbs] 513 cd $cwd 514 if { $stat == 1 } { 515 return 516 } 517 foreach testfile $dbs { 518 open_and_dump_file $testfile $repenv(master) $masterdir/t2 \ 519 repl_noop dump_file_direction "-first" "-next" 520 521 if { [string compare [convert_method $method] -recno] != 0 } { 522 filesort $t2 $t3 523 file rename -force $t3 $t2 524 } 525 for { set i 0 } { $i < $nclients } { incr i } { 526 puts "\t$test: Verifying client $i database $testfile contents." 527 open_and_dump_file $testfile $repenv($i) \ 528 $t1 repl_noop dump_file_direction "-first" "-next" 529 530 if { [string compare $omethod "-recno"] != 0 } { 531 filesort $t1 $t3 532 } else { 533 catch {file copy -force $t1 $t3} ret 534 } 535 error_check_good diff_files($t2,$t3) [filecmp $t2 $t3] 0 536 } 537 } 538} 539 540# Remove all the elements from the master and verify that these 541# deletions properly propagated to the clients. 542proc repl_verdel { test method { nclients 1 } } { 543 global clientdir 544 global masterdir 545 global repenv 546 547 # Delete all items in the master. 548 set cwd [pwd] 549 cd $masterdir 550 set stat [catch {glob test*.db} dbs] 551 cd $cwd 552 if { $stat == 1 } { 553 return 554 } 555 foreach testfile $dbs { 556 puts "\t$test: Deleting all items from the master." 557 set txn [$repenv(master) txn] 558 error_check_good txn_begin [is_valid_txn $txn \ 559 $repenv(master)] TRUE 560 set db [eval berkdb_open -txn $txn -env $repenv(master) \ 561 $testfile] 562 error_check_good reopen_master [is_valid_db $db] TRUE 563 set dbc [$db cursor -txn $txn] 564 error_check_good reopen_master_cursor \ 565 [is_valid_cursor $dbc $db] TRUE 566 for { set dbt [$dbc get -first] } { [llength $dbt] > 0 } \ 567 { set dbt [$dbc get -next] } { 568 error_check_good del_item [$dbc del] 0 569 } 570 error_check_good dbc_close [$dbc close] 0 571 error_check_good txn_commit [$txn commit] 0 572 error_check_good db_close [$db close] 0 573 574 repl_envprocq $test $nclients 575 576 # Check clients. 577 for { set i 0 } { $i < $nclients } { incr i } { 578 puts "\t$test: Verifying client database $i is empty." 579 580 set db [eval berkdb_open -env $repenv($i) $testfile] 581 error_check_good reopen_client($i) \ 582 [is_valid_db $db] TRUE 583 set dbc [$db cursor] 584 error_check_good reopen_client_cursor($i) \ 585 [is_valid_cursor $dbc $db] TRUE 586 587 error_check_good client($i)_empty \ 588 [llength [$dbc get -first]] 0 589 590 error_check_good dbc_close [$dbc close] 0 591 error_check_good db_close [$db close] 0 592 } 593 } 594} 595 596# Replication "check" function for the dump procs that expect to 597# be able to verify the keys and data. 598proc repl_noop { k d } { 599 return 600} 601 602# Close all the master and client environments in a replication test directory. 603proc repl_envclose { test envargs } { 604 source ./include.tcl 605 global clientdir 606 global encrypt 607 global masterdir 608 global repenv 609 global drop 610 611 if { [lsearch $envargs "-encrypta*"] !=-1 } { 612 set encrypt 1 613 } 614 615 # In order to make sure that we have fully-synced and ready-to-verify 616 # databases on all the clients, do a checkpoint on the master and 617 # process messages in order to flush all the clients. 618 set drop 0 619 berkdb debug_check 620 puts "\t$test: Checkpointing master." 621 error_check_good masterenv_ckp [$repenv(master) txn_checkpoint] 0 622 623 # Count clients. 624 for { set ncli 0 } { 1 } { incr ncli } { 625 if { $repenv($ncli) == "NULL" } { 626 break 627 } 628 $repenv($ncli) rep_request 100 100 629 } 630 repl_envprocq $test $ncli 631 632 error_check_good masterenv_close [$repenv(master) close] 0 633 verify_dir $masterdir "\t$test: " 0 0 1 634 for { set i 0 } { $i < $ncli } { incr i } { 635 error_check_good client($i)_close [$repenv($i) close] 0 636 verify_dir $clientdir($i) "\t$test: " 0 0 1 637 } 638 replclose $testdir/MSGQUEUEDIR 639 640} 641 642# Replnoop is a dummy function to substitute for replsend 643# when replication is off. 644proc replnoop { control rec fromid toid flags lsn } { 645 return 0 646} 647 648proc replclose { queuedir } { 649 global queueenv queuedbs machids 650 651 foreach m $machids { 652 set db $queuedbs($m) 653 error_check_good dbr_close [$db close] 0 654 } 655 error_check_good qenv_close [$queueenv close] 0 656 set machids {} 657} 658 659# Create a replication group for testing. 660proc replsetup { queuedir } { 661 global queueenv queuedbs machids 662 663 file mkdir $queuedir 664 set max_locks 20000 665 set queueenv [berkdb_env \ 666 -create -txn nosync -lock_max_locks $max_locks -home $queuedir] 667 error_check_good queueenv [is_valid_env $queueenv] TRUE 668 669 if { [info exists queuedbs] } { 670 unset queuedbs 671 } 672 set machids {} 673 674 return $queueenv 675} 676 677# Send function for replication. 678proc replsend { control rec fromid toid flags lsn } { 679 global queuedbs queueenv machids 680 global drop drop_msg 681 global perm_sent_list 682 global anywhere 683 684 set permflags [lsearch $flags "perm"] 685 if { [llength $perm_sent_list] != 0 && $permflags != -1 } { 686# puts "replsend sent perm message, LSN $lsn" 687 lappend perm_sent_list $lsn 688 } 689 690 # 691 # If we are testing with dropped messages, then we drop every 692 # $drop_msg time. If we do that just return 0 and don't do 693 # anything. 694 # 695 if { $drop != 0 } { 696 incr drop 697 if { $drop == $drop_msg } { 698 set drop 1 699 return 0 700 } 701 } 702 # XXX 703 # -1 is DB_BROADCAST_EID 704 if { $toid == -1 } { 705 set machlist $machids 706 } else { 707 if { [info exists queuedbs($toid)] != 1 } { 708 error "replsend: machid $toid not found" 709 } 710 set m NULL 711 if { $anywhere != 0 } { 712 # 713 # If we can send this anywhere, send it to the first 714 # id we find that is neither toid or fromid. 715 # 716 set anyflags [lsearch $flags "any"] 717 if { $anyflags != -1 } { 718 foreach m $machids { 719 if { $m == $fromid || $m == $toid } { 720 continue 721 } 722 set machlist [list $m] 723 break 724 } 725 } 726 } 727 # 728 # If we didn't find a different site, then we must 729 # fallback to the toid. 730 # 731 if { $m == "NULL" } { 732 set machlist [list $toid] 733 } 734 } 735 736 foreach m $machlist { 737 # do not broadcast to self. 738 if { $m == $fromid } { 739 continue 740 } 741 742 set db $queuedbs($m) 743 set txn [$queueenv txn] 744 $db put -txn $txn -append [list $control $rec $fromid] 745 error_check_good replsend_commit [$txn commit] 0 746 } 747 748 queue_logcheck 749 return 0 750} 751 752# 753# If the message queue log files are getting too numerous, checkpoint 754# and archive them. Some tests are so large (particularly from 755# run_repmethod) that they can consume far too much disk space. 756proc queue_logcheck { } { 757 global queueenv 758 759 760 set logs [$queueenv log_archive -arch_log] 761 set numlogs [llength $logs] 762 if { $numlogs > 10 } { 763 $queueenv txn_checkpoint 764 $queueenv log_archive -arch_remove 765 } 766} 767 768# Discard all the pending messages for a particular site. 769proc replclear { machid } { 770 global queuedbs queueenv 771 772 if { [info exists queuedbs($machid)] != 1 } { 773 error "FAIL: replclear: machid $machid not found" 774 } 775 776 set db $queuedbs($machid) 777 set txn [$queueenv txn] 778 set dbc [$db cursor -txn $txn] 779 for { set dbt [$dbc get -rmw -first] } { [llength $dbt] > 0 } \ 780 { set dbt [$dbc get -rmw -next] } { 781 error_check_good replclear($machid)_del [$dbc del] 0 782 } 783 error_check_good replclear($machid)_dbc_close [$dbc close] 0 784 error_check_good replclear($machid)_txn_commit [$txn commit] 0 785} 786 787# Add a machine to a replication environment. 788proc repladd { machid } { 789 global queueenv queuedbs machids 790 791 if { [info exists queuedbs($machid)] == 1 } { 792 error "FAIL: repladd: machid $machid already exists" 793 } 794 795 set queuedbs($machid) [berkdb open -auto_commit \ 796 -env $queueenv -create -recno -renumber repqueue$machid.db] 797 error_check_good repqueue_create [is_valid_db $queuedbs($machid)] TRUE 798 799 lappend machids $machid 800} 801 802# Acquire a handle to work with an existing machine's replication 803# queue. This is for situations where more than one process 804# is working with a message queue. In general, having more than one 805# process handle the queue is wrong. However, in order to test some 806# things, we need two processes (since Tcl doesn't support threads). We 807# go to great pain in the test harness to make sure this works, but we 808# don't let customers do it. 809proc repljoin { machid } { 810 global queueenv queuedbs machids 811 812 set queuedbs($machid) [berkdb open -auto_commit \ 813 -env $queueenv repqueue$machid.db] 814 error_check_good repqueue_create [is_valid_db $queuedbs($machid)] TRUE 815 816 lappend machids $machid 817} 818 819# Process a queue of messages, skipping every "skip_interval" entry. 820# We traverse the entire queue, but since we skip some messages, we 821# may end up leaving things in the queue, which should get picked up 822# on a later run. 823proc replprocessqueue { dbenv machid { skip_interval 0 } { hold_electp NONE } \ 824 { dupmasterp NONE } { errp NONE } } { 825 global queuedbs queueenv errorCode 826 global perm_response_list 827 global startup_done 828 829 # hold_electp is a call-by-reference variable which lets our caller 830 # know we need to hold an election. 831 if { [string compare $hold_electp NONE] != 0 } { 832 upvar $hold_electp hold_elect 833 } 834 set hold_elect 0 835 836 # dupmasterp is a call-by-reference variable which lets our caller 837 # know we have a duplicate master. 838 if { [string compare $dupmasterp NONE] != 0 } { 839 upvar $dupmasterp dupmaster 840 } 841 set dupmaster 0 842 843 # errp is a call-by-reference variable which lets our caller 844 # know we have gotten an error (that they expect). 845 if { [string compare $errp NONE] != 0 } { 846 upvar $errp errorp 847 } 848 set errorp 0 849 850 set nproced 0 851 852 set txn [$queueenv txn] 853 854 # If we are running separate processes, the second process has 855 # to join an existing message queue. 856 if { [info exists queuedbs($machid)] == 0 } { 857 repljoin $machid 858 } 859 860 set dbc [$queuedbs($machid) cursor -txn $txn] 861 862 error_check_good process_dbc($machid) \ 863 [is_valid_cursor $dbc $queuedbs($machid)] TRUE 864 865 for { set dbt [$dbc get -first] } \ 866 { [llength $dbt] != 0 } \ 867 { } { 868 set data [lindex [lindex $dbt 0] 1] 869 set recno [lindex [lindex $dbt 0] 0] 870 871 # If skip_interval is nonzero, we want to process messages 872 # out of order. We do this in a simple but slimy way-- 873 # continue walking with the cursor without processing the 874 # message or deleting it from the queue, but do increment 875 # "nproced". The way this proc is normally used, the 876 # precise value of nproced doesn't matter--we just don't 877 # assume the queues are empty if it's nonzero. Thus, 878 # if we contrive to make sure it's nonzero, we'll always 879 # come back to records we've skipped on a later call 880 # to replprocessqueue. (If there really are no records, 881 # we'll never get here.) 882 # 883 # Skip every skip_interval'th record (and use a remainder other 884 # than zero so that we're guaranteed to really process at least 885 # one record on every call). 886 if { $skip_interval != 0 } { 887 if { $nproced % $skip_interval == 1 } { 888 incr nproced 889 set dbt [$dbc get -next] 890 continue 891 } 892 } 893 894 # We need to remove the current message from the queue, 895 # because we're about to end the transaction and someone 896 # else processing messages might come in and reprocess this 897 # message which would be bad. 898 error_check_good queue_remove [$dbc del] 0 899 900 # We have to play an ugly cursor game here: we currently 901 # hold a lock on the page of messages, but rep_process_message 902 # might need to lock the page with a different cursor in 903 # order to send a response. So save the next recno, close 904 # the cursor, and then reopen and reset the cursor. 905 # If someone else is processing this queue, our entry might 906 # have gone away, and we need to be able to handle that. 907 908 error_check_good dbc_process_close [$dbc close] 0 909 error_check_good txn_commit [$txn commit] 0 910 911 set ret [catch {$dbenv rep_process_message \ 912 [lindex $data 2] [lindex $data 0] [lindex $data 1]} res] 913 914 # Save all ISPERM and NOTPERM responses so we can compare their 915 # LSNs to the LSN in the log. The variable perm_response_list 916 # holds the entire response so we can extract responses and 917 # LSNs as needed. 918 # 919 if { [llength $perm_response_list] != 0 && \ 920 ([is_substr $res ISPERM] || [is_substr $res NOTPERM]) } { 921 lappend perm_response_list $res 922 } 923 924 if { $ret != 0 } { 925 if { [string compare $errp NONE] != 0 } { 926 set errorp "$dbenv $machid $res" 927 } else { 928 error "FAIL:[timestamp]\ 929 rep_process_message returned $res" 930 } 931 } 932 933 incr nproced 934 935 # Now, re-establish the cursor position. We fetch the 936 # current record number. If there is something there, 937 # that is the record for the next iteration. If there 938 # is nothing there, then we've consumed the last item 939 # in the queue. 940 941 set txn [$queueenv txn] 942 set dbc [$queuedbs($machid) cursor -txn $txn] 943 set dbt [$dbc get -set_range $recno] 944 945 if { $ret == 0 } { 946 set rettype [lindex $res 0] 947 set retval [lindex $res 1] 948 # 949 # Do nothing for 0 and NEWSITE 950 # 951 if { [is_substr $rettype STARTUPDONE] } { 952 set startup_done 1 953 } 954 if { [is_substr $rettype HOLDELECTION] } { 955 set hold_elect 1 956 } 957 if { [is_substr $rettype DUPMASTER] } { 958 set dupmaster "1 $dbenv $machid" 959 } 960 if { [is_substr $rettype NOTPERM] || \ 961 [is_substr $rettype ISPERM] } { 962 set lsnfile [lindex $retval 0] 963 set lsnoff [lindex $retval 1] 964 } 965 } 966 967 if { $errorp != 0 } { 968 # Break also on an error, caller wants to handle it. 969 break 970 } 971 if { $hold_elect == 1 } { 972 # Break also on a HOLDELECTION, for the same reason. 973 break 974 } 975 if { $dupmaster == 1 } { 976 # Break also on a DUPMASTER, for the same reason. 977 break 978 } 979 980 } 981 982 error_check_good dbc_close [$dbc close] 0 983 error_check_good txn_commit [$txn commit] 0 984 985 # Return the number of messages processed. 986 return $nproced 987} 988 989 990set run_repl_flag "-run_repl" 991 992proc extract_repl_args { args } { 993 global run_repl_flag 994 995 for { set arg [lindex $args [set i 0]] } \ 996 { [string length $arg] > 0 } \ 997 { set arg [lindex $args [incr i]] } { 998 if { [string compare $arg $run_repl_flag] == 0 } { 999 return [lindex $args [expr $i + 1]] 1000 } 1001 } 1002 return "" 1003} 1004 1005proc delete_repl_args { args } { 1006 global run_repl_flag 1007 1008 set ret {} 1009 1010 for { set arg [lindex $args [set i 0]] } \ 1011 { [string length $arg] > 0 } \ 1012 { set arg [lindex $args [incr i]] } { 1013 if { [string compare $arg $run_repl_flag] != 0 } { 1014 lappend ret $arg 1015 } else { 1016 incr i 1017 } 1018 } 1019 return $ret 1020} 1021 1022global elect_serial 1023global elections_in_progress 1024set elect_serial 0 1025 1026# Start an election in a sub-process. 1027proc start_election \ 1028 { pfx qdir envstring nsites nvotes pri timeout {err "none"} {crash 0}} { 1029 source ./include.tcl 1030 global elect_serial elections_in_progress machids 1031 global rep_verbose 1032 1033 set filelist {} 1034 set ret [catch {glob $testdir/ELECTION*.$elect_serial} result] 1035 if { $ret == 0 } { 1036 set filelist [concat $filelist $result] 1037 } 1038 foreach f $filelist { 1039 fileremove -f $f 1040 } 1041 1042 set oid [open $testdir/ELECTION_SOURCE.$elect_serial w] 1043 1044 puts $oid "source $test_path/test.tcl" 1045 puts $oid "set elected_event 0" 1046 puts $oid "set elected_env \"NONE\"" 1047 puts $oid "set is_repchild 1" 1048 puts $oid "replsetup $qdir" 1049 foreach i $machids { puts $oid "repladd $i" } 1050 puts $oid "set env_cmd \{$envstring\}" 1051 if { $rep_verbose == 1 } { 1052 puts $oid "set dbenv \[eval \$env_cmd -errfile \ 1053 /dev/stdout -errpfx $pfx \]" 1054 } else { 1055 puts $oid "set dbenv \[eval \$env_cmd -errfile \ 1056 $testdir/ELECTION_ERRFILE.$elect_serial -errpfx $pfx \]" 1057 } 1058 puts $oid "\$dbenv test abort $err" 1059 puts $oid "set res \[catch \{\$dbenv rep_elect $nsites \ 1060 $nvotes $pri $timeout\} ret\]" 1061 puts $oid "set r \[open \$testdir/ELECTION_RESULT.$elect_serial w\]" 1062 puts $oid "if \{\$res == 0 \} \{" 1063 puts $oid "puts \$r \"SUCCESS \$ret\"" 1064 puts $oid "\} else \{" 1065 puts $oid "puts \$r \"ERROR \$ret\"" 1066 puts $oid "\}" 1067 # 1068 # This loop calls rep_elect a second time with the error cleared. 1069 # We don't want to do that if we are simulating a crash. 1070 if { $err != "none" && $crash != 1 } { 1071 puts $oid "\$dbenv test abort none" 1072 puts $oid "set res \[catch \{\$dbenv rep_elect $nsites \ 1073 $nvotes $pri $timeout\} ret\]" 1074 puts $oid "if \{\$res == 0 \} \{" 1075 puts $oid "puts \$r \"SUCCESS \$ret\"" 1076 puts $oid "\} else \{" 1077 puts $oid "puts \$r \"ERROR \$ret\"" 1078 puts $oid "\}" 1079 } 1080 1081 puts $oid "if \{ \$elected_event == 1 \} \{" 1082 puts $oid "puts \$r \"ELECTED \$elected_env\"" 1083 puts $oid "\}" 1084 1085 puts $oid "close \$r" 1086 close $oid 1087 1088 set t [open "|$tclsh_path >& $testdir/ELECTION_OUTPUT.$elect_serial" w] 1089 if { $rep_verbose } { 1090 set t [open "|$tclsh_path" w] 1091 } 1092 puts $t "source ./include.tcl" 1093 puts $t "source $testdir/ELECTION_SOURCE.$elect_serial" 1094 flush $t 1095 1096 set elections_in_progress($elect_serial) $t 1097 return $elect_serial 1098} 1099 1100# 1101# If we are doing elections during upgrade testing, set 1102# upgrade to 1. Doing that sets the priority to the 1103# test priority in rep_elect, which will simulate a 1104# 0-priority but electable site. 1105# 1106proc setpriority { priority nclients winner {start 0} {upgrade 0} } { 1107 global electable_pri 1108 upvar $priority pri 1109 1110 for { set i $start } { $i < [expr $nclients + $start] } { incr i } { 1111 if { $i == $winner } { 1112 set pri($i) 100 1113 } else { 1114 if { $upgrade } { 1115 set pri($i) $electable_pri 1116 } else { 1117 set pri($i) 10 1118 } 1119 } 1120 } 1121} 1122 1123# run_election has the following arguments: 1124# Arrays: 1125# ecmd Array of the commands for setting up each client env. 1126# cenv Array of the handles to each client env. 1127# errcmd Array of where errors should be forced. 1128# priority Array of the priorities of each client env. 1129# crash If an error is forced, should we crash or recover? 1130# The upvar command takes care of making these arrays available to 1131# the procedure. 1132# 1133# Ordinary variables: 1134# qdir Directory where the message queue is located. 1135# msg Message prefixed to the output. 1136# elector This client calls the first election. 1137# nsites Number of sites in the replication group. 1138# nvotes Number of votes required to win the election. 1139# nclients Number of clients participating in the election. 1140# win The expected winner of the election. 1141# reopen Should the new master (i.e. winner) be closed 1142# and reopened as a client? 1143# dbname Name of the underlying database. The caller 1144# should send in "NULL" if the database has not 1145# yet been created. 1146# ignore Should the winner ignore its own election? 1147# If ignore is 1, the winner is not made master. 1148# timeout_ok We expect that this election will not succeed 1149# in electing a new master (perhaps because there 1150# already is a master). 1151 1152proc run_election { ecmd celist errcmd priority crsh\ 1153 qdir msg elector nsites nvotes nclients win reopen\ 1154 dbname {ignore 0} {timeout_ok 0} } { 1155 1156 global elect_timeout elect_serial 1157 global is_hp_test 1158 global is_windows_test 1159 global rand_init 1160 upvar $ecmd env_cmd 1161 upvar $celist cenvlist 1162 upvar $errcmd err_cmd 1163 upvar $priority pri 1164 upvar $crsh crash 1165 1166 set elect_timeout(default) 15000000 1167 # Windows and HP-UX require a longer timeout. 1168 if { $is_windows_test == 1 || $is_hp_test == 1 } { 1169 set elect_timeout(default) [expr $elect_timeout(default) * 2] 1170 } 1171 1172 set long_timeout $elect_timeout(default) 1173 # 1174 # Initialize tries based on the default timeout. 1175 # We use tries to loop looking for messages because 1176 # as sites are sleeping waiting for their timeout 1177 # to expire we need to keep checking for messages. 1178 # 1179 set tries [expr [expr $long_timeout * 4] / 1000000] 1180 # 1181 # Retry indicates whether the test should retry the election 1182 # if it gets a timeout. This is primarily used for the 1183 # varied timeout election test because we expect short timeouts 1184 # to timeout when interacting with long timeouts and the 1185 # short timeout sites need to call elections again. 1186 # 1187 set retry 0 1188 foreach pair $cenvlist { 1189 set id [lindex $pair 1] 1190 set i [expr $id - 2] 1191 set elect_pipe($i) INVALID 1192 # 1193 # Array get should return us a list of 1 element: 1194 # { {$i timeout_value} } 1195 # If that doesn't exist, use the default. 1196 # 1197 set this_timeout [array get elect_timeout $i] 1198 if { [llength $this_timeout] } { 1199 set e_timeout($i) [lindex $this_timeout 1] 1200 # 1201 # Set number of tries based on the biggest 1202 # timeout we see in this group if using 1203 # varied timeouts. 1204 # 1205 set retry 1 1206 if { $e_timeout($i) > $long_timeout } { 1207 set long_timeout $e_timeout($i) 1208 set tries [expr $long_timeout / 1000000] 1209 } 1210 } else { 1211 set e_timeout($i) $elect_timeout(default) 1212 } 1213 replclear $id 1214 } 1215 1216 # 1217 # XXX 1218 # We need to somehow check for the warning if nvotes is not 1219 # a majority. Problem is that warning will go into the child 1220 # process' output. Furthermore, we need a mechanism that can 1221 # handle both sending the output to a file and sending it to 1222 # /dev/stderr when debugging without failing the 1223 # error_check_good check. 1224 # 1225 puts "\t\t$msg.1: Election with nsites=$nsites,\ 1226 nvotes=$nvotes, nclients=$nclients" 1227 puts "\t\t$msg.2: First elector is $elector,\ 1228 expected winner is $win (eid [expr $win + 2])" 1229 incr elect_serial 1230 set pfx "CHILD$elector.$elect_serial" 1231 set elect_pipe($elector) [start_election \ 1232 $pfx $qdir $env_cmd($elector) $nsites $nvotes $pri($elector) \ 1233 $e_timeout($elector) $err_cmd($elector) $crash($elector)] 1234 tclsleep 2 1235 1236 set got_newmaster 0 1237 set max_retry $tries 1238 1239 # If we're simulating a crash, skip the while loop and 1240 # just give the initial election a chance to complete. 1241 set crashing 0 1242 for { set i 0 } { $i < $nclients } { incr i } { 1243 if { $crash($i) == 1 } { 1244 set crashing 1 1245 } 1246 } 1247 1248 global elected_event 1249 global elected_env 1250 set elected_event 0 1251 set c_elected_event 0 1252 set elected_env "NONE" 1253 1254 set orig_tries $tries 1255 if { $crashing == 1 } { 1256 tclsleep 10 1257 } else { 1258 set retry_cnt 0 1259 while { 1 } { 1260 set nproced 0 1261 set he 0 1262 set winning_envid -1 1263 set c_winning_envid -1 1264 1265 foreach pair $cenvlist { 1266 set he 0 1267 set unavail 0 1268 set envid [lindex $pair 1] 1269 set i [expr $envid - 2] 1270 set clientenv($i) [lindex $pair 0] 1271 1272 # If the "elected" event is received by the 1273 # child process, the env set up in that child 1274 # is the elected env. 1275 set child_done [check_election $elect_pipe($i)\ 1276 unavail c_elected_event c_elected_env] 1277 if { $c_elected_event != 0 } { 1278 set elected_event 1 1279 set c_winning_envid $envid 1280 set c_elected_event 0 1281 } 1282 1283 incr nproced [replprocessqueue \ 1284 $clientenv($i) $envid 0 he] 1285# puts "Tries $tries:\ 1286# Processed queue for client $i, $nproced msgs he $he unavail $unavail" 1287 1288 # Check for completed election. If it's the 1289 # first time we've noticed it, deal with it. 1290 if { $elected_event == 1 && \ 1291 $got_newmaster == 0 } { 1292 set got_newmaster 1 1293 1294 # Find env id of winner. 1295 if { $c_winning_envid != -1 } { 1296 set winning_envid \ 1297 $c_winning_envid 1298 set c_winning_envid -1 1299 } else { 1300 foreach pair $cenvlist { 1301 if { [lindex $pair 0]\ 1302 == $elected_env } { 1303 set winning_envid \ 1304 [lindex $pair 1] 1305 break 1306 } 1307 } 1308 } 1309 1310 # Make sure it's the expected winner. 1311 error_check_good right_winner \ 1312 $winning_envid [expr $win + 2] 1313 1314 # Reconfigure winning env as master. 1315 if { $ignore == 0 } { 1316 $clientenv($i) errpfx \ 1317 NEWMASTER 1318 error_check_good \ 1319 make_master($i) \ 1320 [$clientenv($i) \ 1321 rep_start -master] 0 1322 1323 # Don't hold another election 1324 # yet if we are setting up a 1325 # new master. This could 1326 # cause the new master to 1327 # declare itself a client 1328 # during internal init. 1329 set he 0 1330 } 1331 1332 # Occasionally force new log records 1333 # to be written, unless the database 1334 # has not yet been created. 1335 set write [berkdb random_int 1 10] 1336 if { $write == 1 && $dbname != "NULL" } { 1337 set db [eval berkdb_open_noerr \ 1338 -env $clientenv($i) \ 1339 -auto_commit $dbname] 1340 error_check_good dbopen \ 1341 [is_valid_db $db] TRUE 1342 error_check_good dbclose \ 1343 [$db close] 0 1344 } 1345 } 1346 1347 # If the previous election failed with a 1348 # timeout and we need to retry because we 1349 # are testing varying site timeouts, force 1350 # a hold election to start a new one. 1351 if { $unavail && $retry && $retry_cnt < $max_retry} { 1352 incr retry_cnt 1353 puts "\t\t$msg.2.b: Client $i timed\ 1354 out. Retry $retry_cnt\ 1355 of max $max_retry" 1356 set he 1 1357 set tries $orig_tries 1358 } 1359 if { $he == 1 && $got_newmaster == 0 } { 1360 # 1361 # Only close down the election pipe if the 1362 # previously created one is done and 1363 # waiting for new commands, otherwise 1364 # if we try to close it while it's in 1365 # progress we hang this main tclsh. 1366 # 1367 if { $elect_pipe($i) != "INVALID" && \ 1368 $child_done == 1 } { 1369 close_election $elect_pipe($i) 1370 set elect_pipe($i) "INVALID" 1371 } 1372# puts "Starting election on client $i" 1373 if { $elect_pipe($i) == "INVALID" } { 1374 incr elect_serial 1375 set pfx "CHILD$i.$elect_serial" 1376 set elect_pipe($i) [start_election \ 1377 $pfx $qdir \ 1378 $env_cmd($i) $nsites \ 1379 $nvotes $pri($i) $e_timeout($i)] 1380 set got_hold_elect($i) 1 1381 } 1382 } 1383 } 1384 1385 # We need to wait around to make doubly sure that the 1386 # election has finished... 1387 if { $nproced == 0 } { 1388 incr tries -1 1389 # 1390 # If we have a newmaster already, set tries 1391 # down to just allow straggling messages to 1392 # be processed. Tries could be a very large 1393 # number if we have long timeouts. 1394 # 1395 if { $got_newmaster != 0 && $tries > 10 } { 1396 set tries 10 1397 } 1398 if { $tries == 0 } { 1399 break 1400 } else { 1401 tclsleep 1 1402 } 1403 } else { 1404 set tries $tries 1405 } 1406 } 1407 1408 # If we did get a new master, its identity was checked 1409 # at that time. But we still have to make sure that we 1410 # didn't just time out. 1411 1412 if { $got_newmaster == 0 && $timeout_ok == 0 } { 1413 error "FAIL: Did not elect new master." 1414 } 1415 } 1416 cleanup_elections 1417 1418 # 1419 # Make sure we've really processed all the post-election 1420 # sync-up messages. If we're simulating a crash, don't process 1421 # any more messages. 1422 # 1423 if { $crashing == 0 } { 1424 process_msgs $cenvlist 1425 } 1426 1427 if { $reopen == 1 } { 1428 puts "\t\t$msg.3: Closing new master and reopening as client" 1429 error_check_good log_flush [$clientenv($win) log_flush] 0 1430 error_check_good newmaster_close [$clientenv($win) close] 0 1431 1432 set clientenv($win) [eval $env_cmd($win)] 1433 error_check_good cl($win) [is_valid_env $clientenv($win)] TRUE 1434 set newelector "$clientenv($win) [expr $win + 2]" 1435 set cenvlist [lreplace $cenvlist $win $win $newelector] 1436 if { $crashing == 0 } { 1437 process_msgs $cenvlist 1438 } 1439 } 1440} 1441 1442proc check_election { id unavailp elected_eventp elected_envp } { 1443 source ./include.tcl 1444 1445 if { $id == "INVALID" } { 1446 return 0 1447 } 1448 upvar $unavailp unavail 1449 upvar $elected_eventp elected_event 1450 upvar $elected_envp elected_env 1451 1452 set unavail 0 1453 set elected_event 0 1454 set elected_env "NONE" 1455 1456 set res [catch {open $testdir/ELECTION_RESULT.$id} nmid] 1457 if { $res != 0 } { 1458 return 0 1459 } 1460 while { [gets $nmid val] != -1 } { 1461# puts "result $id: $val" 1462 set str [lindex $val 0] 1463 if { [is_substr $val UNAVAIL] } { 1464 set unavail 1 1465 } 1466 if { [is_substr $val ELECTED] } { 1467 set elected_event 1 1468 set elected_env [lindex $val 1] 1469 } 1470 } 1471 close $nmid 1472 return 1 1473} 1474 1475proc close_election { i } { 1476 global elections_in_progress 1477 global noenv_messaging 1478 global qtestdir 1479 1480 if { $noenv_messaging == 1 } { 1481 set testdir $qtestdir 1482 } 1483 1484 set t $elections_in_progress($i) 1485 puts $t "replclose \$testdir/MSGQUEUEDIR" 1486 puts $t "\$dbenv close" 1487 close $t 1488 unset elections_in_progress($i) 1489} 1490 1491proc cleanup_elections { } { 1492 global elect_serial elections_in_progress 1493 1494 for { set i 0 } { $i <= $elect_serial } { incr i } { 1495 if { [info exists elections_in_progress($i)] != 0 } { 1496 close_election $i 1497 } 1498 } 1499 1500 set elect_serial 0 1501} 1502 1503# 1504# This is essentially a copy of test001, but it only does the put/get 1505# loop AND it takes an already-opened db handle. 1506# 1507proc rep_test { method env repdb {nentries 10000} \ 1508 {start 0} {skip 0} {needpad 0} args } { 1509 1510 source ./include.tcl 1511 global databases_in_memory 1512 1513 # 1514 # Open the db if one isn't given. Close before exit. 1515 # 1516 if { $repdb == "NULL" } { 1517 if { $databases_in_memory == 1 } { 1518 set testfile { "" "test.db" } 1519 } else { 1520 set testfile "test.db" 1521 } 1522 set largs [convert_args $method $args] 1523 set omethod [convert_method $method] 1524 set db [eval {berkdb_open_noerr} -env $env -auto_commit\ 1525 -create -mode 0644 $omethod $largs $testfile] 1526 error_check_good reptest_db [is_valid_db $db] TRUE 1527 } else { 1528 set db $repdb 1529 } 1530 1531 puts "\t\tRep_test: $method $nentries key/data pairs starting at $start" 1532 set did [open $dict] 1533 1534 # The "start" variable determines the record number to start 1535 # with, if we're using record numbers. The "skip" variable 1536 # determines which dictionary entry to start with. In normal 1537 # use, skip is equal to start. 1538 1539 if { $skip != 0 } { 1540 for { set count 0 } { $count < $skip } { incr count } { 1541 gets $did str 1542 } 1543 } 1544 set pflags "" 1545 set gflags "" 1546 set txn "" 1547 1548 if { [is_record_based $method] == 1 } { 1549 append gflags " -recno" 1550 } 1551 puts "\t\tRep_test.a: put/get loop" 1552 # Here is the loop where we put and get each key/data pair 1553 set count 0 1554 1555 # Checkpoint 10 times during the run, but not more 1556 # frequently than every 5 entries. 1557 set checkfreq [expr $nentries / 10] 1558 1559 # Abort occasionally during the run. 1560 set abortfreq [expr $nentries / 15] 1561 1562 while { [gets $did str] != -1 && $count < $nentries } { 1563 if { [is_record_based $method] == 1 } { 1564 global kvals 1565 1566 set key [expr $count + 1 + $start] 1567 if { 0xffffffff > 0 && $key > 0xffffffff } { 1568 set key [expr $key - 0x100000000] 1569 } 1570 if { $key == 0 || $key - 0xffffffff == 1 } { 1571 incr key 1572 incr count 1573 } 1574 set kvals($key) [pad_data $method $str] 1575 } else { 1576 set key $str 1577 set str [reverse $str] 1578 } 1579 # 1580 # We want to make sure we send in exactly the same 1581 # length data so that LSNs match up for some tests 1582 # in replication (rep021). 1583 # 1584 if { [is_fixed_length $method] == 1 && $needpad } { 1585 # 1586 # Make it something visible and obvious, 'A'. 1587 # 1588 set p 65 1589 set str [make_fixed_length $method $str $p] 1590 set kvals($key) $str 1591 } 1592 set t [$env txn] 1593 error_check_good txn [is_valid_txn $t $env] TRUE 1594 set txn "-txn $t" 1595 set ret [eval \ 1596 {$db put} $txn $pflags {$key [chop_data $method $str]}] 1597 error_check_good put $ret 0 1598 error_check_good txn [$t commit] 0 1599 1600 if { $checkfreq < 5 } { 1601 set checkfreq 5 1602 } 1603 if { $abortfreq < 3 } { 1604 set abortfreq 3 1605 } 1606 # 1607 # Do a few aborted transactions to test that 1608 # aborts don't get processed on clients and the 1609 # master handles them properly. Just abort 1610 # trying to delete the key we just added. 1611 # 1612 if { $count % $abortfreq == 0 } { 1613 set t [$env txn] 1614 error_check_good txn [is_valid_txn $t $env] TRUE 1615 set ret [$db del -txn $t $key] 1616 error_check_good txn [$t abort] 0 1617 } 1618 if { $count % $checkfreq == 0 } { 1619 error_check_good txn_checkpoint($count) \ 1620 [$env txn_checkpoint] 0 1621 } 1622 incr count 1623 } 1624 close $did 1625 if { $repdb == "NULL" } { 1626 error_check_good rep_close [$db close] 0 1627 } 1628} 1629 1630# 1631# This is essentially a copy of rep_test, but it only does the put/get 1632# loop in a long running txn to an open db. We use it for bulk testing 1633# because we want to fill the bulk buffer some before sending it out. 1634# Bulk buffer gets transmitted on every commit. 1635# 1636proc rep_test_bulk { method env repdb {nentries 10000} \ 1637 {start 0} {skip 0} {useoverflow 0} args } { 1638 source ./include.tcl 1639 1640 global overflowword1 1641 global overflowword2 1642 global databases_in_memory 1643 1644 if { [is_fixed_length $method] && $useoverflow == 1 } { 1645 puts "Skipping overflow for fixed length method $method" 1646 return 1647 } 1648 # 1649 # Open the db if one isn't given. Close before exit. 1650 # 1651 if { $repdb == "NULL" } { 1652 if { $databases_in_memory == 1 } { 1653 set testfile { "" "test.db" } 1654 } else { 1655 set testfile "test.db" 1656 } 1657 set largs [convert_args $method $args] 1658 set omethod [convert_method $method] 1659 set db [eval {berkdb_open_noerr -env $env -auto_commit -create \ 1660 -mode 0644} $largs $omethod $testfile] 1661 error_check_good reptest_db [is_valid_db $db] TRUE 1662 } else { 1663 set db $repdb 1664 } 1665 1666 # 1667 # If we are using an env, then testfile should just be the db name. 1668 # Otherwise it is the test directory and the name. 1669 # If we are not using an external env, then test setting 1670 # the database cache size and using multiple caches. 1671 puts \ 1672"\t\tRep_test_bulk: $method $nentries key/data pairs starting at $start" 1673 set did [open $dict] 1674 1675 # The "start" variable determines the record number to start 1676 # with, if we're using record numbers. The "skip" variable 1677 # determines which dictionary entry to start with. In normal 1678 # use, skip is equal to start. 1679 1680 if { $skip != 0 } { 1681 for { set count 0 } { $count < $skip } { incr count } { 1682 gets $did str 1683 } 1684 } 1685 set pflags "" 1686 set gflags "" 1687 set txn "" 1688 1689 if { [is_record_based $method] == 1 } { 1690 append gflags " -recno" 1691 } 1692 puts "\t\tRep_test_bulk.a: put/get loop in 1 txn" 1693 # Here is the loop where we put and get each key/data pair 1694 set count 0 1695 1696 set t [$env txn] 1697 error_check_good txn [is_valid_txn $t $env] TRUE 1698 set txn "-txn $t" 1699 set pid [pid] 1700 while { [gets $did str] != -1 && $count < $nentries } { 1701 if { [is_record_based $method] == 1 } { 1702 global kvals 1703 1704 set key [expr $count + 1 + $start] 1705 if { 0xffffffff > 0 && $key > 0xffffffff } { 1706 set key [expr $key - 0x100000000] 1707 } 1708 if { $key == 0 || $key - 0xffffffff == 1 } { 1709 incr key 1710 incr count 1711 } 1712 set kvals($key) [pad_data $method $str] 1713 if { [is_fixed_length $method] == 0 } { 1714 set str [repeat $str 100] 1715 } 1716 } else { 1717 set key $str.$pid 1718 set str [repeat $str 100] 1719 } 1720 # 1721 # For use for overflow test. 1722 # 1723 if { $useoverflow == 0 } { 1724 if { [string length $overflowword1] < \ 1725 [string length $str] } { 1726 set overflowword2 $overflowword1 1727 set overflowword1 $str 1728 } 1729 } else { 1730 if { $count == 0 } { 1731 set len [string length $overflowword1] 1732 set word $overflowword1 1733 } else { 1734 set len [string length $overflowword2] 1735 set word $overflowword1 1736 } 1737 set rpt [expr 1024 * 1024 / $len] 1738 incr rpt 1739 set str [repeat $word $rpt] 1740 } 1741 set ret [eval \ 1742 {$db put} $txn $pflags {$key [chop_data $method $str]}] 1743 error_check_good put $ret 0 1744 incr count 1745 } 1746 error_check_good txn [$t commit] 0 1747 error_check_good txn_checkpoint [$env txn_checkpoint] 0 1748 close $did 1749 if { $repdb == "NULL" } { 1750 error_check_good rep_close [$db close] 0 1751 } 1752} 1753 1754proc rep_test_upg { method env repdb {nentries 10000} \ 1755 {start 0} {skip 0} {needpad 0} {inmem 0} args } { 1756 1757 source ./include.tcl 1758 1759 # 1760 # Open the db if one isn't given. Close before exit. 1761 # 1762 if { $repdb == "NULL" } { 1763 if { $inmem == 1 } { 1764 set testfile { "" "test.db" } 1765 } else { 1766 set testfile "test.db" 1767 } 1768 set largs [convert_args $method $args] 1769 set omethod [convert_method $method] 1770 set db [eval {berkdb_open_noerr} -env $env -auto_commit\ 1771 -create -mode 0644 $omethod $largs $testfile] 1772 error_check_good reptest_db [is_valid_db $db] TRUE 1773 } else { 1774 set db $repdb 1775 } 1776 1777 set pid [pid] 1778 puts "\t\tRep_test_upg($pid): $method $nentries key/data pairs starting at $start" 1779 set did [open $dict] 1780 1781 # The "start" variable determines the record number to start 1782 # with, if we're using record numbers. The "skip" variable 1783 # determines which dictionary entry to start with. In normal 1784 # use, skip is equal to start. 1785 1786 if { $skip != 0 } { 1787 for { set count 0 } { $count < $skip } { incr count } { 1788 gets $did str 1789 } 1790 } 1791 set pflags "" 1792 set gflags "" 1793 set txn "" 1794 1795 if { [is_record_based $method] == 1 } { 1796 append gflags " -recno" 1797 } 1798 puts "\t\tRep_test.a: put/get loop" 1799 # Here is the loop where we put and get each key/data pair 1800 set count 0 1801 1802 # Checkpoint 10 times during the run, but not more 1803 # frequently than every 5 entries. 1804 set checkfreq [expr $nentries / 10] 1805 1806 # Abort occasionally during the run. 1807 set abortfreq [expr $nentries / 15] 1808 1809 while { [gets $did str] != -1 && $count < $nentries } { 1810 if { [is_record_based $method] == 1 } { 1811 global kvals 1812 1813 set key [expr $count + 1 + $start] 1814 if { 0xffffffff > 0 && $key > 0xffffffff } { 1815 set key [expr $key - 0x100000000] 1816 } 1817 if { $key == 0 || $key - 0xffffffff == 1 } { 1818 incr key 1819 incr count 1820 } 1821 set kvals($key) [pad_data $method $str] 1822 } else { 1823 # 1824 # With upgrade test, we run the same test several 1825 # times with the same database. We want to have 1826 # some overwritten records and some new records. 1827 # Therefore append our pid to half the keys. 1828 # 1829 if { $count % 2 } { 1830 set key $str.$pid 1831 } else { 1832 set key $str 1833 } 1834 set str [reverse $str] 1835 } 1836 # 1837 # We want to make sure we send in exactly the same 1838 # length data so that LSNs match up for some tests 1839 # in replication (rep021). 1840 # 1841 if { [is_fixed_length $method] == 1 && $needpad } { 1842 # 1843 # Make it something visible and obvious, 'A'. 1844 # 1845 set p 65 1846 set str [make_fixed_length $method $str $p] 1847 set kvals($key) $str 1848 } 1849 set t [$env txn] 1850 error_check_good txn [is_valid_txn $t $env] TRUE 1851 set txn "-txn $t" 1852# puts "rep_test_upg: put $count of $nentries: key $key, data $str" 1853 set ret [eval \ 1854 {$db put} $txn $pflags {$key [chop_data $method $str]}] 1855 error_check_good put $ret 0 1856 error_check_good txn [$t commit] 0 1857 1858 if { $checkfreq < 5 } { 1859 set checkfreq 5 1860 } 1861 if { $abortfreq < 3 } { 1862 set abortfreq 3 1863 } 1864 # 1865 # Do a few aborted transactions to test that 1866 # aborts don't get processed on clients and the 1867 # master handles them properly. Just abort 1868 # trying to delete the key we just added. 1869 # 1870 if { $count % $abortfreq == 0 } { 1871 set t [$env txn] 1872 error_check_good txn [is_valid_txn $t $env] TRUE 1873 set ret [$db del -txn $t $key] 1874 error_check_good txn [$t abort] 0 1875 } 1876 if { $count % $checkfreq == 0 } { 1877 error_check_good txn_checkpoint($count) \ 1878 [$env txn_checkpoint] 0 1879 } 1880 incr count 1881 } 1882 close $did 1883 if { $repdb == "NULL" } { 1884 error_check_good rep_close [$db close] 0 1885 } 1886} 1887 1888proc rep_test_upg.check { key data } { 1889 # 1890 # If the key has the pid attached, strip it off before checking. 1891 # If the key does not have the pid attached, then it is a recno 1892 # and we're done. 1893 # 1894 set i [string first . $key] 1895 if { $i != -1 } { 1896 set key [string replace $key $i end] 1897 } 1898 error_check_good "key/data mismatch" $data [reverse $key] 1899} 1900 1901proc rep_test_upg.recno.check { key data } { 1902 # 1903 # If we're a recno database we better not have a pid in the key. 1904 # Otherwise we're done. 1905 # 1906 set i [string first . $key] 1907 error_check_good pid $i -1 1908} 1909 1910# 1911# This is the basis for a number of simple repmgr test cases. It creates 1912# an appointed master and two clients, calls rep_test to process some records 1913# and verifies the resulting databases. The following parameters control 1914# runtime options: 1915# niter - number of records to process 1916# inmemdb - put databases in-memory (0, 1) 1917# inmemlog - put logs in-memory (0, 1) 1918# peer - make the second client a peer of the first client (0, 1) 1919# bulk - use bulk processing (0, 1) 1920# inmemrep - put replication files in-memory (0, 1) 1921# 1922proc basic_repmgr_test { method niter tnum inmemdb inmemlog peer bulk \ 1923 inmemrep largs } { 1924 global testdir 1925 global rep_verbose 1926 global verbose_type 1927 global overflowword1 1928 global overflowword2 1929 global databases_in_memory 1930 set overflowword1 "0" 1931 set overflowword2 "0" 1932 set nsites 3 1933 1934 # Set databases_in_memory for this test, preserving original value. 1935 if { $inmemdb } { 1936 set restore_dbinmem $databases_in_memory 1937 set databases_in_memory 1 1938 } 1939 1940 set verbargs "" 1941 if { $rep_verbose == 1 } { 1942 set verbargs " -verbose {$verbose_type on} " 1943 } 1944 1945 env_cleanup $testdir 1946 set ports [available_ports $nsites] 1947 1948 set masterdir $testdir/MASTERDIR 1949 set clientdir $testdir/CLIENTDIR 1950 set clientdir2 $testdir/CLIENTDIR2 1951 1952 file mkdir $masterdir 1953 file mkdir $clientdir 1954 file mkdir $clientdir2 1955 1956 # In-memory logs require a large log buffer, and cannot 1957 # be used with -txn nosync. Adjust the args. 1958 if { $inmemlog } { 1959 set logtype "in-memory" 1960 } else { 1961 set logtype "on-disk" 1962 } 1963 set logargs [adjust_logargs $logtype] 1964 set txnargs [adjust_txnargs $logtype] 1965 1966 # Determine in-memory replication argument for environments. 1967 if { $inmemrep } { 1968 set repmemarg "-rep_inmem_files " 1969 } else { 1970 set repmemarg "" 1971 } 1972 1973 # Use different connection retry timeout values to handle any 1974 # collisions from starting sites at the same time by retrying 1975 # at different times. 1976 1977 # Open a master. 1978 puts "\tRepmgr$tnum.a: Start an appointed master." 1979 set ma_envcmd "berkdb_env_noerr -create $logargs $verbargs \ 1980 -errpfx MASTER -home $masterdir $txnargs -rep -thread \ 1981 -lock_max_locks 10000 -lock_max_objects 10000 $repmemarg" 1982 set masterenv [eval $ma_envcmd] 1983 $masterenv repmgr -ack all -nsites $nsites \ 1984 -timeout {conn_retry 20000000} \ 1985 -local [list localhost [lindex $ports 0]] \ 1986 -start master 1987 1988 # Open first client 1989 puts "\tRepmgr$tnum.b: Start first client." 1990 set cl_envcmd "berkdb_env_noerr -create $verbargs $logargs \ 1991 -errpfx CLIENT -home $clientdir $txnargs -rep -thread \ 1992 -lock_max_locks 10000 -lock_max_objects 10000 $repmemarg" 1993 set clientenv [eval $cl_envcmd] 1994 $clientenv repmgr -ack all -nsites $nsites \ 1995 -timeout {conn_retry 10000000} \ 1996 -local [list localhost [lindex $ports 1]] \ 1997 -remote [list localhost [lindex $ports 0]] \ 1998 -remote [list localhost [lindex $ports 2]] \ 1999 -start client 2000 await_startup_done $clientenv 2001 2002 # Open second client 2003 puts "\tRepmgr$tnum.c: Start second client." 2004 set cl2_envcmd "berkdb_env_noerr -create $verbargs $logargs \ 2005 -errpfx CLIENT2 -home $clientdir2 $txnargs -rep -thread \ 2006 -lock_max_locks 10000 -lock_max_objects 10000 $repmemarg" 2007 set clientenv2 [eval $cl2_envcmd] 2008 if { $peer } { 2009 $clientenv2 repmgr -ack all -nsites $nsites \ 2010 -timeout {conn_retry 5000000} \ 2011 -local [list localhost [lindex $ports 2]] \ 2012 -remote [list localhost [lindex $ports 0]] \ 2013 -remote [list localhost [lindex $ports 1] peer] \ 2014 -start client 2015 } else { 2016 $clientenv2 repmgr -ack all -nsites $nsites \ 2017 -timeout {conn_retry 5000000} \ 2018 -local [list localhost [lindex $ports 2]] \ 2019 -remote [list localhost [lindex $ports 0]] \ 2020 -remote [list localhost [lindex $ports 1]] \ 2021 -start client 2022 } 2023 await_startup_done $clientenv2 2024 2025 # 2026 # Use of -ack all guarantees replication complete before repmgr send 2027 # function returns and rep_test finishes. 2028 # 2029 puts "\tRepmgr$tnum.d: Run some transactions at master." 2030 if { $bulk } { 2031 # Turn on bulk processing on master. 2032 error_check_good set_bulk [$masterenv rep_config {bulk on}] 0 2033 2034 eval rep_test_bulk $method $masterenv NULL $niter 0 0 0 $largs 2035 2036 # Must turn off bulk because some configs (debug_rop/wop) 2037 # generate log records when verifying databases. 2038 error_check_good set_bulk [$masterenv rep_config {bulk off}] 0 2039 } else { 2040 eval rep_test $method $masterenv NULL $niter 0 0 0 $largs 2041 } 2042 2043 puts "\tRepmgr$tnum.e: Verifying client database contents." 2044 rep_verify $masterdir $masterenv $clientdir $clientenv 1 1 1 2045 rep_verify $masterdir $masterenv $clientdir2 $clientenv2 1 1 1 2046 2047 # For in-memory replication, verify replication files not there. 2048 if { $inmemrep } { 2049 puts "\tRepmgr$tnum.f: Verify no replication files on disk." 2050 no_rep_files_on_disk $masterdir 2051 no_rep_files_on_disk $clientdir 2052 no_rep_files_on_disk $clientdir2 2053 } 2054 2055 # Restore original databases_in_memory value. 2056 if { $inmemdb } { 2057 set databases_in_memory $restore_dbinmem 2058 } 2059 2060 error_check_good client2_close [$clientenv2 close] 0 2061 error_check_good client_close [$clientenv close] 0 2062 error_check_good masterenv_close [$masterenv close] 0 2063} 2064 2065# 2066# This is the basis for simple repmgr election test cases. It opens three 2067# clients of different priorities and makes sure repmgr elects the 2068# expected master. Then it shuts the master down and makes sure repmgr 2069# elects the expected remaining client master. Then it makes sure the former 2070# master can join as a client. The following parameters control 2071# runtime options: 2072# niter - number of records to process 2073# inmemrep - put replication files in-memory (0, 1) 2074# 2075proc basic_repmgr_election_test { method niter tnum inmemrep largs } { 2076 global rep_verbose 2077 global testdir 2078 global verbose_type 2079 set nsites 3 2080 2081 set verbargs "" 2082 if { $rep_verbose == 1 } { 2083 set verbargs " -verbose {$verbose_type on} " 2084 } 2085 2086 env_cleanup $testdir 2087 set ports [available_ports $nsites] 2088 2089 set clientdir $testdir/CLIENTDIR 2090 set clientdir2 $testdir/CLIENTDIR2 2091 set clientdir3 $testdir/CLIENTDIR3 2092 2093 file mkdir $clientdir 2094 file mkdir $clientdir2 2095 file mkdir $clientdir3 2096 2097 # Determine in-memory replication argument for environments. 2098 if { $inmemrep } { 2099 set repmemarg "-rep_inmem_files " 2100 } else { 2101 set repmemarg "" 2102 } 2103 2104 # Use different connection retry timeout values to handle any 2105 # collisions from starting sites at the same time by retrying 2106 # at different times. 2107 2108 puts "\tRepmgr$tnum.a: Start three clients." 2109 2110 # Open first client 2111 set cl_envcmd "berkdb_env_noerr -create $verbargs \ 2112 -errpfx CLIENT -home $clientdir -txn -rep -thread $repmemarg" 2113 set clientenv [eval $cl_envcmd] 2114 $clientenv repmgr -ack all -nsites $nsites -pri 100 \ 2115 -timeout {conn_retry 20000000} \ 2116 -local [list localhost [lindex $ports 0]] \ 2117 -remote [list localhost [lindex $ports 1]] \ 2118 -remote [list localhost [lindex $ports 2]] \ 2119 -start elect 2120 2121 # Open second client 2122 set cl2_envcmd "berkdb_env_noerr -create $verbargs \ 2123 -errpfx CLIENT2 -home $clientdir2 -txn -rep -thread $repmemarg" 2124 set clientenv2 [eval $cl2_envcmd] 2125 $clientenv2 repmgr -ack all -nsites $nsites -pri 30 \ 2126 -timeout {conn_retry 10000000} \ 2127 -local [list localhost [lindex $ports 1]] \ 2128 -remote [list localhost [lindex $ports 0]] \ 2129 -remote [list localhost [lindex $ports 2]] \ 2130 -start elect 2131 2132 # Open third client 2133 set cl3_envcmd "berkdb_env_noerr -create $verbargs \ 2134 -errpfx CLIENT3 -home $clientdir3 -txn -rep -thread $repmemarg" 2135 set clientenv3 [eval $cl3_envcmd] 2136 $clientenv3 repmgr -ack all -nsites $nsites -pri 20 \ 2137 -timeout {conn_retry 5000000} \ 2138 -local [list localhost [lindex $ports 2]] \ 2139 -remote [list localhost [lindex $ports 0]] \ 2140 -remote [list localhost [lindex $ports 1]] \ 2141 -start elect 2142 2143 puts "\tRepmgr$tnum.b: Elect first client master." 2144 await_expected_master $clientenv 2145 set masterenv $clientenv 2146 set masterdir $clientdir 2147 await_startup_done $clientenv2 2148 await_startup_done $clientenv3 2149 2150 # 2151 # Use of -ack all guarantees replication complete before repmgr send 2152 # function returns and rep_test finishes. 2153 # 2154 puts "\tRepmgr$tnum.c: Run some transactions at master." 2155 eval rep_test $method $masterenv NULL $niter 0 0 0 $largs 2156 2157 puts "\tRepmgr$tnum.d: Verify client database contents." 2158 rep_verify $masterdir $masterenv $clientdir2 $clientenv2 1 1 1 2159 rep_verify $masterdir $masterenv $clientdir3 $clientenv3 1 1 1 2160 2161 puts "\tRepmgr$tnum.e: Shut down master, elect second client master." 2162 error_check_good client_close [$clientenv close] 0 2163 await_expected_master $clientenv2 2164 set masterenv $clientenv2 2165 await_startup_done $clientenv3 2166 2167 puts "\tRepmgr$tnum.f: Restart former master as client." 2168 # Open -recover to clear env region, including startup_done value. 2169 set clientenv [eval $cl_envcmd -recover] 2170 $clientenv repmgr -ack all -nsites $nsites -pri 100 \ 2171 -timeout {conn_retry 20000000} \ 2172 -local [list localhost [lindex $ports 0]] \ 2173 -remote [list localhost [lindex $ports 1]] \ 2174 -remote [list localhost [lindex $ports 2]] \ 2175 -start client 2176 await_startup_done $clientenv 2177 2178 puts "\tRepmgr$tnum.g: Run some transactions at new master." 2179 eval rep_test $method $masterenv NULL $niter $niter 0 0 $largs 2180 2181 puts "\tRepmgr$tnum.h: Verify client database contents." 2182 set masterdir $clientdir2 2183 rep_verify $masterdir $masterenv $clientdir $clientenv 1 1 1 2184 rep_verify $masterdir $masterenv $clientdir3 $clientenv3 1 1 1 2185 2186 # For in-memory replication, verify replication files not there. 2187 if { $inmemrep } { 2188 puts "\tRepmgr$tnum.i: Verify no replication files on disk." 2189 no_rep_files_on_disk $clientdir 2190 no_rep_files_on_disk $clientdir2 2191 no_rep_files_on_disk $clientdir3 2192 } 2193 2194 error_check_good client3_close [$clientenv3 close] 0 2195 error_check_good client_close [$clientenv close] 0 2196 error_check_good client2_close [$clientenv2 close] 0 2197} 2198 2199# 2200# This is the basis for simple repmgr internal init test cases. It starts 2201# an appointed master and two clients, processing transactions between each 2202# additional site. Then it verifies all expected transactions are 2203# replicated. The following parameters control runtime options: 2204# niter - number of records to process 2205# inmemrep - put replication files in-memory (0, 1) 2206# 2207proc basic_repmgr_init_test { method niter tnum inmemrep largs } { 2208 global rep_verbose 2209 global testdir 2210 global verbose_type 2211 set nsites 3 2212 2213 set verbargs "" 2214 if { $rep_verbose == 1 } { 2215 set verbargs " -verbose {$verbose_type on} " 2216 } 2217 2218 env_cleanup $testdir 2219 set ports [available_ports $nsites] 2220 2221 set masterdir $testdir/MASTERDIR 2222 set clientdir $testdir/CLIENTDIR 2223 set clientdir2 $testdir/CLIENTDIR2 2224 2225 file mkdir $masterdir 2226 file mkdir $clientdir 2227 file mkdir $clientdir2 2228 2229 # Determine in-memory replication argument for environments. 2230 if { $inmemrep } { 2231 set repmemarg "-rep_inmem_files " 2232 } else { 2233 set repmemarg "" 2234 } 2235 2236 # Use different connection retry timeout values to handle any 2237 # collisions from starting sites at the same time by retrying 2238 # at different times. 2239 2240 # Open a master. 2241 puts "\tRepmgr$tnum.a: Start a master." 2242 set ma_envcmd "berkdb_env_noerr -create $verbargs \ 2243 -errpfx MASTER -home $masterdir -txn -rep -thread $repmemarg" 2244 set masterenv [eval $ma_envcmd] 2245 $masterenv repmgr -ack all -nsites $nsites \ 2246 -timeout {conn_retry 20000000} \ 2247 -local [list localhost [lindex $ports 0]] \ 2248 -start master 2249 2250 puts "\tRepmgr$tnum.b: Run some transactions at master." 2251 eval rep_test $method $masterenv NULL $niter 0 0 0 $largs 2252 2253 # Open first client 2254 puts "\tRepmgr$tnum.c: Start first client." 2255 set cl_envcmd "berkdb_env_noerr -create $verbargs \ 2256 -errpfx CLIENT -home $clientdir -txn -rep -thread $repmemarg" 2257 set clientenv [eval $cl_envcmd] 2258 $clientenv repmgr -ack all -nsites $nsites \ 2259 -timeout {conn_retry 10000000} \ 2260 -local [list localhost [lindex $ports 1]] \ 2261 -remote [list localhost [lindex $ports 0]] \ 2262 -remote [list localhost [lindex $ports 2]] \ 2263 -start client 2264 await_startup_done $clientenv 2265 2266 # 2267 # Use of -ack all guarantees replication complete before repmgr send 2268 # function returns and rep_test finishes. 2269 # 2270 puts "\tRepmgr$tnum.d: Run some more transactions at master." 2271 eval rep_test $method $masterenv NULL $niter $niter 0 0 $largs 2272 2273 # Open second client 2274 puts "\tRepmgr$tnum.e: Start second client." 2275 set cl_envcmd "berkdb_env_noerr -create $verbargs \ 2276 -errpfx CLIENT2 -home $clientdir2 -txn -rep -thread $repmemarg" 2277 set clientenv2 [eval $cl_envcmd] 2278 $clientenv2 repmgr -ack all -nsites $nsites \ 2279 -timeout {conn_retry 5000000} \ 2280 -local [list localhost [lindex $ports 2]] \ 2281 -remote [list localhost [lindex $ports 0]] \ 2282 -remote [list localhost [lindex $ports 1]] \ 2283 -start client 2284 await_startup_done $clientenv2 2285 2286 puts "\tRepmgr$tnum.f: Verifying client database contents." 2287 rep_verify $masterdir $masterenv $clientdir $clientenv 1 1 1 2288 rep_verify $masterdir $masterenv $clientdir2 $clientenv2 1 1 1 2289 2290 # For in-memory replication, verify replication files not there. 2291 if { $inmemrep } { 2292 puts "\tRepmgr$tnum.g: Verify no replication files on disk." 2293 no_rep_files_on_disk $masterdir 2294 no_rep_files_on_disk $clientdir 2295 no_rep_files_on_disk $clientdir2 2296 } 2297 2298 error_check_good client2_close [$clientenv2 close] 0 2299 error_check_good client_close [$clientenv close] 0 2300 error_check_good masterenv_close [$masterenv close] 0 2301} 2302 2303# 2304# Verify that no replication files are present in a given directory. 2305# This checks for the gen, egen, internal init, temp db and page db 2306# files. 2307# 2308proc no_rep_files_on_disk { dir } { 2309 error_check_good nogen [file exists "$dir/__db.rep.gen"] 0 2310 error_check_good noegen [file exists "$dir/__db.rep.egen"] 0 2311 error_check_good noinit [file exists "$dir/__db.rep.init"] 0 2312 error_check_good notmpdb [file exists "$dir/__db.rep.db"] 0 2313 error_check_good nopgdb [file exists "$dir/__db.reppg.db"] 0 2314} 2315 2316proc process_msgs { elist {perm_response 0} {dupp NONE} {errp NONE} \ 2317 {upg 0} } { 2318 if { $perm_response == 1 } { 2319 global perm_response_list 2320 set perm_response_list {{}} 2321 } 2322 2323 if { [string compare $dupp NONE] != 0 } { 2324 upvar $dupp dupmaster 2325 set dupmaster 0 2326 } else { 2327 set dupmaster NONE 2328 } 2329 2330 if { [string compare $errp NONE] != 0 } { 2331 upvar $errp errorp 2332 set errorp 0 2333 set var_name errorp 2334 } else { 2335 set errorp NONE 2336 set var_name NONE 2337 } 2338 2339 set upgcount 0 2340 while { 1 } { 2341 set nproced 0 2342 incr nproced [proc_msgs_once $elist dupmaster $var_name] 2343 # 2344 # If we're running the upgrade test, we are running only 2345 # our own env, we need to loop a bit to allow the other 2346 # upgrade procs to run and reply to our messages. 2347 # 2348 if { $upg == 1 && $upgcount < 10 } { 2349 tclsleep 2 2350 incr upgcount 2351 continue 2352 } 2353 if { $nproced == 0 } { 2354 break 2355 } else { 2356 set upgcount 0 2357 } 2358 } 2359} 2360 2361 2362proc proc_msgs_once { elist {dupp NONE} {errp NONE} } { 2363 global noenv_messaging 2364 2365 if { [string compare $dupp NONE] != 0 } { 2366 upvar $dupp dupmaster 2367 set dupmaster 0 2368 } else { 2369 set dupmaster NONE 2370 } 2371 2372 if { [string compare $errp NONE] != 0 } { 2373 upvar $errp errorp 2374 set errorp 0 2375 set var_name errorp 2376 } else { 2377 set errorp NONE 2378 set var_name NONE 2379 } 2380 2381 set nproced 0 2382 foreach pair $elist { 2383 set envname [lindex $pair 0] 2384 set envid [lindex $pair 1] 2385 # 2386 # If we need to send in all the other args 2387# puts "Call replpq with on $envid" 2388 if { $noenv_messaging } { 2389 incr nproced [replprocessqueue_noenv $envname $envid \ 2390 0 NONE dupmaster $var_name] 2391 } else { 2392 incr nproced [replprocessqueue $envname $envid \ 2393 0 NONE dupmaster $var_name] 2394 } 2395 # 2396 # If the user is expecting to handle an error and we get 2397 # one, return the error immediately. 2398 # 2399 if { $dupmaster != 0 && $dupmaster != "NONE" } { 2400 return 0 2401 } 2402 if { $errorp != 0 && $errorp != "NONE" } { 2403# puts "Returning due to error $errorp" 2404 return 0 2405 } 2406 } 2407 return $nproced 2408} 2409 2410proc rep_verify { masterdir masterenv clientdir clientenv \ 2411 {compare_shared_portion 0} {match 1} {logcompare 1} \ 2412 {dbname "test.db"} {datadir ""} } { 2413 global util_path 2414 global encrypt 2415 global passwd 2416 global databases_in_memory 2417 global repfiles_in_memory 2418 global env_private 2419 2420 # Whether a named database is in-memory or on-disk, only the 2421 # the name itself is passed in. Here we do the syntax adjustment 2422 # from "test.db" to { "" "test.db" } for in-memory databases. 2423 # 2424 if { $databases_in_memory && $dbname != "NULL" } { 2425 set dbname " {} $dbname " 2426 } 2427 2428 # Check locations of dbs, repfiles, region files. 2429 if { $dbname != "NULL" } { 2430 check_db_location $masterenv $dbname $datadir 2431 check_db_location $clientenv $dbname $datadir 2432 } 2433 2434 if { $repfiles_in_memory } { 2435 no_rep_files_on_disk $masterdir 2436 no_rep_files_on_disk $clientdir 2437 } 2438 if { $env_private } { 2439 no_region_files_on_disk $masterdir 2440 no_region_files_on_disk $clientdir 2441 } 2442 2443 # The logcompare flag indicates whether to compare logs. 2444 # Sometimes we run a test where rep_verify is run twice with 2445 # no intervening processing of messages. If that test is 2446 # on a build with debug_rop enabled, the master's log is 2447 # altered by the first rep_verify, and the second rep_verify 2448 # will fail. 2449 # To avoid this, skip the log comparison on the second rep_verify 2450 # by specifying logcompare == 0. 2451 # 2452 if { $logcompare } { 2453 set msg "Logs and databases" 2454 } else { 2455 set msg "Databases ($dbname)" 2456 } 2457 2458 if { $match } { 2459 puts "\t\tRep_verify: $clientdir: $msg should match" 2460 } else { 2461 puts "\t\tRep_verify: $clientdir: $msg should not match" 2462 } 2463 # Check that master and client logs and dbs are identical. 2464 2465 # Logs first, if specified ... 2466 # 2467 # If compare_shared_portion is set, run db_printlog on the log 2468 # subset that both client and master have. Either the client or 2469 # the master may have more (earlier) log files, due to internal 2470 # initialization, in-memory log wraparound, or other causes. 2471 # 2472 if { $logcompare } { 2473 error_check_good logcmp \ 2474 [logcmp $masterenv $clientenv $compare_shared_portion] 0 2475 2476 if { $dbname == "NULL" } { 2477 return 2478 } 2479 } 2480 2481 # ... now the databases. 2482 # 2483 # We're defensive here and throw an error if a database does 2484 # not exist. If opening the first database succeeded but the 2485 # second failed, we close the first before reporting the error. 2486 # 2487 if { [catch {eval {berkdb_open_noerr} -env $masterenv\ 2488 -rdonly $dbname} db1] } { 2489 error "FAIL:\ 2490 Unable to open first db $dbname in rep_verify: $db1" 2491 } 2492 if { [catch {eval {berkdb_open_noerr} -env $clientenv\ 2493 -rdonly $dbname} db2] } { 2494 error_check_good close_db1 [$db1 close] 0 2495 error "FAIL:\ 2496 Unable to open second db $dbname in rep_verify: $db2" 2497 } 2498 2499 # db_compare uses the database handles to do the comparison, and 2500 # we pass in the $mumbledir/$dbname string as a label to make it 2501 # easier to identify the offending database in case of failure. 2502 # Therefore this will work for both in-memory and on-disk databases. 2503 if { $match } { 2504 error_check_good [concat comparedbs. $dbname] [db_compare \ 2505 $db1 $db2 $masterdir/$dbname $clientdir/$dbname] 0 2506 } else { 2507 error_check_bad comparedbs [db_compare \ 2508 $db1 $db2 $masterdir/$dbname $clientdir/$dbname] 0 2509 } 2510 error_check_good db1_close [$db1 close] 0 2511 error_check_good db2_close [$db2 close] 0 2512} 2513 2514proc rep_event { env eventlist } { 2515 global startup_done 2516 global elected_event 2517 global elected_env 2518 2519 set event [lindex $eventlist 0] 2520# puts "rep_event: Got event $event on env $env" 2521 set eventlength [llength $eventlist] 2522 2523 if { $event == "startupdone" } { 2524 error_check_good event_nodata $eventlength 1 2525 set startup_done 1 2526 } 2527 if { $event == "elected" } { 2528 error_check_good event_nodata $eventlength 1 2529 set elected_event 1 2530 set elected_env $env 2531 } 2532 if { $event == "newmaster" } { 2533 error_check_good eiddata $eventlength 2 2534 set event_newmasterid [lindex $eventlist 1] 2535 } 2536 return 2537} 2538 2539# Return a list of TCP port numbers that are not currently in use on 2540# the local system. Note that this doesn't actually reserve the 2541# ports, so it's possible that by the time the caller tries to use 2542# them, another process could have taken one of them. But for our 2543# purposes that's unlikely enough that this is still useful: it's 2544# still better than trying to find hard-coded port numbers that will 2545# always be available. 2546# 2547proc available_ports { n } { 2548 set ports {} 2549 set socks {} 2550 2551 while {[incr n -1] >= 0} { 2552 set sock [socket -server Unused -myaddr localhost 0] 2553 set port [lindex [fconfigure $sock -sockname] 2] 2554 2555 lappend socks $sock 2556 lappend ports $port 2557 } 2558 2559 foreach sock $socks { 2560 close $sock 2561 } 2562 return $ports 2563} 2564 2565# Wait (a limited amount of time) for an arbitrary condition to become true, 2566# polling once per second. If time runs out we throw an error: a successful 2567# return implies the condition is indeed true. 2568# 2569proc await_condition { cond { limit 20 } } { 2570 for {set i 0} {$i < $limit} {incr i} { 2571 if {[uplevel 1 [list expr $cond]]} { 2572 return 2573 } 2574 tclsleep 1 2575 } 2576 error "FAIL: condition \{$cond\} not achieved in $limit seconds." 2577} 2578 2579proc await_startup_done { env { limit 20 } } { 2580 await_condition {[stat_field $env rep_stat "Startup complete"]} $limit 2581} 2582 2583# Wait (a limited amount of time) for an election to yield the expected 2584# environment as winner. 2585# 2586proc await_expected_master { env { limit 20 } } { 2587 await_condition {[stat_field $env rep_stat "Role"] == "master"} $limit 2588} 2589 2590proc do_leaseop { env db method key envlist { domsgs 1 } } { 2591 global alphabet 2592 2593 # 2594 # Put a txn to the database. Process messages to envlist 2595 # if directed to do so. Read data on the master, ignoring 2596 # leases (should always succeed). 2597 # 2598 set num [berkdb random_int 1 100] 2599 set data $alphabet.$num 2600 set t [$env txn] 2601 error_check_good txn [is_valid_txn $t $env] TRUE 2602 set txn "-txn $t" 2603 set ret [eval \ 2604 {$db put} $txn {$key [chop_data $method $data]}] 2605 error_check_good put $ret 0 2606 error_check_good txn [$t commit] 0 2607 2608 if { $domsgs } { 2609 process_msgs $envlist 2610 } 2611 2612 # 2613 # Now make sure we can successfully read on the master 2614 # if we ignore leases. That should always work. The 2615 # caller will do any lease related calls and checks 2616 # that are specific to the test. 2617 # 2618 set kd [$db get -nolease $key] 2619 set curs [$db cursor] 2620 set ckd [$curs get -nolease -set $key] 2621 $curs close 2622 error_check_good kd [llength $kd] 1 2623 error_check_good ckd [llength $ckd] 1 2624} 2625 2626# 2627# Get the given key, expecting status depending on whether leases 2628# are currently expected to be valid or not. 2629# 2630proc check_leaseget { db key getarg status } { 2631 set stat [catch {eval {$db get} $getarg $key} kd] 2632 if { $status != 0 } { 2633 error_check_good get_result $stat 1 2634 error_check_good kd_check \ 2635 [is_substr $kd $status] 1 2636 } else { 2637 error_check_good get_result_good $stat $status 2638 error_check_good dbkey [lindex [lindex $kd 0] 0] $key 2639 } 2640 set curs [$db cursor] 2641 set stat [catch {eval {$curs get} $getarg -set $key} kd] 2642 if { $status != 0 } { 2643 error_check_good get_result2 $stat 1 2644 error_check_good kd_check \ 2645 [is_substr $kd $status] 1 2646 } else { 2647 error_check_good get_result2_good $stat $status 2648 error_check_good dbckey [lindex [lindex $kd 0] 0] $key 2649 } 2650 $curs close 2651} 2652 2653# Simple utility to check a client database for expected values. It does not 2654# handle dup keys. 2655# 2656proc verify_client_data { env db items } { 2657 set dbp [berkdb open -env $env $db] 2658 foreach i $items { 2659 foreach {key expected_value} $i { 2660 set results [$dbp get $key] 2661 error_check_good result_length [llength $results] 1 2662 set value [lindex $results 0 1] 2663 error_check_good expected_value $value $expected_value 2664 } 2665 } 2666 $dbp close 2667} 2668 2669proc make_dbconfig { dir cnfs } { 2670 global rep_verbose 2671 2672 set f [open "$dir/DB_CONFIG" "w"] 2673 foreach line $cnfs { 2674 puts $f $line 2675 } 2676 if {$rep_verbose} { 2677 puts $f "set_verbose DB_VERB_REPLICATION" 2678 } 2679 close $f 2680} 2681 2682proc open_site_prog { cmds } { 2683 2684 set site_prog [setup_site_prog] 2685 2686 set s [open "| $site_prog" "r+"] 2687 fconfigure $s -buffering line 2688 set synced yes 2689 foreach cmd $cmds { 2690 puts $s $cmd 2691 if {[lindex $cmd 0] == "start"} { 2692 gets $s 2693 set synced yes 2694 } else { 2695 set synced no 2696 } 2697 } 2698 if {! $synced} { 2699 puts $s "echo done" 2700 gets $s 2701 } 2702 return $s 2703} 2704 2705proc setup_site_prog { } { 2706 source ./include.tcl 2707 2708 # Generate the proper executable name for the system. 2709 if { $is_windows_test } { 2710 set repsite_executable db_repsite.exe 2711 } else { 2712 set repsite_executable db_repsite 2713 } 2714 2715 # Check whether the executable exists. 2716 if { [file exists $util_path/$repsite_executable] == 0 } { 2717 error "Skipping: db_repsite executable\ 2718 not found. Is it built?" 2719 } else { 2720 set site_prog $util_path/$repsite_executable 2721 } 2722 return $site_prog 2723} 2724 2725proc next_expected_lsn { env } { 2726 return [stat_field $env rep_stat "Next LSN expected"] 2727} 2728 2729proc lsn_file { lsn } { 2730 if { [llength $lsn] != 2 } { 2731 error "not a valid LSN: $lsn" 2732 } 2733 2734 return [lindex $lsn 0] 2735} 2736 2737proc assert_rep_flag { dir flag value } { 2738 global util_path 2739 2740 set stat [exec $util_path/db_stat -N -RA -h $dir] 2741 set present [is_substr $stat $flag] 2742 error_check_good expected.flag.$flag $present $value 2743} 2744