1# See the file LICENSE for redistribution information. 2# 3# Copyright (c) 1999,2009 Oracle. All rights reserved. 4# 5# $Id$ 6# 7# TEST db_reptest 8# TEST Wrapper to configure and run the db_reptest program. 9 10# 11# TODO: 12# late client start. 13# Number of message proc threads. 14# 15 16global last_nsites 17set last_nsites 0 18 19# 20# There are 3 user-level procs that the user may invoke. 21# 1. db_reptest - Runs randomized configurations in a loop. 22# 2. basic_db_reptest - Runs a simple set configuration once, 23# as a smoke test. 24# 3. restore_db_reptest 'dir' - Runs the configuration given in 'dir' 25# in a loop. The purpose is either to reproduce a problem 26# that some configuration encountered, or test a fix. 27# 28 29# 30# db_reptest - Run a randomized configuration. Run the test 31# 'count' times in a loop, or if no count it given, it is 32# an infinite loop. 33# 34proc db_reptest { {count -1} } { 35 global rand_init 36 37 berkdb srand $rand_init 38 set cmd "db_reptest_int random" 39 db_reptest_loop $cmd $count 40} 41 42# 43# Run a basic reptest. The types are: 44# Basic 0 - Two sites, start with site 1 as master, 5 worker threads, btree, 45# run 100 seconds, onesite remote knowledge. 46# Basic 1 - Three sites, all sites start as client, 5 worker threads, btree 47# run 150 seconds, full remote knowledge. 48# 49proc basic_db_reptest { { basic 0 } } { 50 global util_path 51 52 if { [file exists $util_path/db_reptest] == 0 } { 53 puts "Skipping db_reptest. Is it built?" 54 return 55 } 56 if { $basic == 0 } { 57 db_reptest_int basic0 58 } 59 if { $basic == 1 } { 60 db_reptest_int basic1 61 } 62} 63 64# 65# Restore a configuration from the given directory and 66# run that configuration in a loop 'count' times. 67# 68proc restore_db_reptest { restoredir { count -1 } } { 69 set cmd "db_reptest_int restore $restoredir/SAVE_RUN" 70 db_reptest_loop $cmd $count 71} 72 73# 74# Wrapper to run the command in a loop, 'count' times. 75# 76proc db_reptest_loop { cmd count } { 77 global util_path 78 79 if { [file exists $util_path/db_reptest] == 0 } { 80 puts "Skipping db_reptest. Is it built?" 81 return 82 } 83 set iteration 1 84 while { 1 } { 85 puts -nonewline "ITERATION $iteration: " 86 puts [clock format [clock seconds] -format "%H:%M %D"] 87 88 # 89 eval $cmd 90 91 puts -nonewline "COMPLETED $iteration: " 92 puts [clock format [clock seconds] -format "%H:%M %D"] 93 incr iteration 94 if { $count > 0 && $iteration > $count } { 95 break 96 } 97 } 98} 99 100# 101# Internal version of db_reptest that all user-level procs 102# eventually call. It will configure a single run of 103# db_reptest based on the configuration type specified 104# in 'cfgtype'. This proc will: 105# Configure a run of db_reptest 106# Run db_reptest 107# Verify the sites after db_reptest completes. 108# 109proc db_reptest_int { cfgtype { restoredir NULL } } { 110 source ./include.tcl 111 global rporttype 112 113 env_cleanup $testdir 114 115 set savedir TESTDIR/SAVE_RUN 116 reptest_cleanup $savedir 117 118 # 119 # Get all the default or random values needed for the test 120 # and its args first. 121 # 122 set runtime 0 123 set kill 0 124 # 125 # Get number of sites first because pretty much everything else 126 # after here depends on how many sites there are. 127 # 128 set num_sites [get_nsites $cfgtype $restoredir] 129 set use_lease [get_lease $cfgtype $restoredir] 130 # 131 # Only use kill if we have > 2 sites. 132 # Returns the site number of the site to kill, or 0 133 # if this will not be a kill test. 134 # 135 if { $num_sites > 2 } { 136 set kill [get_kill $cfgtype $restoredir $num_sites] 137 } 138 if { $cfgtype != "restore" } { 139 if { $use_lease } { 140 set use_master 0 141 } else { 142 set use_master [get_usemaster $cfgtype] 143 } 144 set master_site [get_mastersite $cfgtype $use_master $num_sites] 145 set workers [get_workers $cfgtype $use_lease] 146 set dbtype [get_dbtype $cfgtype] 147 set runtime [get_runtime $cfgtype] 148 set use_peers [get_peers $cfgtype] 149 puts -nonewline "Running: $num_sites sites, $runtime seconds " 150 if { $kill } { 151 puts -nonewline "kill site $kill " 152 } 153 if { $use_lease } { 154 puts "with leases" 155 } elseif { $use_master } { 156 puts "master site $master_site" 157 } else { 158 puts "no master" 159 } 160 } 161 set baseport 6100 162 set rporttype NULL 163 # 164 # This loop sets up the args to the invocation of db_reptest 165 # for each site. 166 # 167 for { set i 1 } {$i <= $num_sites } { incr i } { 168 set envdirs($i) TESTDIR/ENV$i 169 reptest_cleanup $envdirs($i) 170 # 171 # If we are restoring the args, just read them from the 172 # saved location for this sites. Otherwise build up 173 # the args for each piece we need. 174 # 175 if { $cfgtype == "restore" } { 176 set cid [open $restoredir/DB_REPTEST_ARGS.$i r] 177 set prog_args($i) [read $cid] 178 close $cid 179 if { $runtime == 0 } { 180 set runtime [parse_runtime $prog_args($i)] 181 puts "Runtime: $runtime" 182 } 183 } else { 184 set prog_args($i) \ 185 "-v -c $workers -t $dbtype -T $runtime " 186 set prog_args($i) \ 187 [concat $prog_args($i) "-h $envdirs($i)"] 188 # 189 # Add in if this site should kill itself. 190 # 191 if { $kill == $i } { 192 set prog_args($i) [concat $prog_args($i) "-k"] 193 } 194 # 195 # Add in if this site starts as a master or client. 196 # 197 if { $i == $master_site } { 198 set state($i) MASTER 199 set prog_args($i) [concat $prog_args($i) "-M"] 200 } else { 201 set state($i) CLIENT 202 # 203 # If we have a master, then we just want to 204 # start as a client. Otherwise start with 205 # elections. 206 # 207 if { $use_master } { 208 set prog_args($i) \ 209 [concat $prog_args($i) "-C"] 210 } else { 211 set prog_args($i) \ 212 [concat $prog_args($i) "-E"] 213 } 214 } 215 # 216 # Add in host:port configuration, both this site's 217 # local address and any remote addresses it knows. 218 # 219 set lport($i) [expr $baseport + $i] 220 set prog_args($i) \ 221 [concat $prog_args($i) "-l localhost:$lport($i)"] 222 set rport($i) [get_rport $baseport $i \ 223 $num_sites $cfgtype] 224 if { $use_peers } { 225 set remote_arg "-R" 226 } else { 227 set remote_arg "-r" 228 } 229 foreach p $rport($i) { 230 set prog_args($i) \ 231 [concat $prog_args($i) $remote_arg \ 232 "localhost:$p"] 233 } 234 } 235 save_db_reptest $savedir ARGS $i $prog_args($i) 236 } 237 238 # Now make the DB_CONFIG file for each site. 239 reptest_make_config $savedir $num_sites envdirs state \ 240 $use_lease $cfgtype $restoredir 241 242 # Run the test 243 run_db_reptest $savedir $num_sites $runtime 244 puts "Test run complete. Verify." 245 246 # Verify the test run. 247 verify_db_reptest $num_sites envdirs $kill 248 249} 250 251# 252# Make a DB_CONFIG file for all sites in the group 253# 254proc reptest_make_config { savedir nsites edirs st lease cfgtype restoredir } { 255 upvar $edirs envdirs 256 upvar $st state 257 258 # 259 # Generate global config values that should be the same 260 # across all sites, such as number of sites and log size, etc. 261 # 262 set default_cfglist { 263 { "rep_set_nsites" $nsites } 264 { "rep_set_request" "150000 2400000" } 265 { "rep_set_timeout" "db_rep_checkpoint_delay 0" } 266 { "rep_set_timeout" "db_rep_connection_retry 2000000" } 267 { "rep_set_timeout" "db_rep_heartbeat_monitor 1000000" } 268 { "rep_set_timeout" "db_rep_heartbeat_send 500000" } 269 { "set_cachesize" "0 536870912 1" } 270 { "set_lg_max" "131072" } 271 { "set_lk_detect" "db_lock_default" } 272 { "set_verbose" "db_verb_recovery" } 273 { "set_verbose" "db_verb_replication" } 274 } 275 276 set acks { db_repmgr_acks_all db_repmgr_acks_all_peers \ 277 db_repmgr_acks_none db_repmgr_acks_one db_repmgr_acks_one_peer \ 278 db_repmgr_acks_quorum } 279 280 # 281 # Ack policy must be the same on all sites. 282 # 283 if { $cfgtype == "random" } { 284 if { $lease } { 285 set ackpolicy db_repmgr_acks_quorum 286 } else { 287 set done 0 288 while { $done == 0 } { 289 set acksz [expr [llength $acks] - 1] 290 set myack [berkdb random_int 0 $acksz] 291 set ackpolicy [lindex $acks $myack] 292 # 293 # Only allow the "none" policy with 2 sites 294 # otherwise it can overwhelm the system and 295 # it is a rarely used option. 296 # 297 if { $ackpolicy == "db_repmgr_acks_none" && \ 298 $nsites > 2 } { 299 continue 300 } 301 set done 1 302 } 303 } 304 } else { 305 set ackpolicy db_repmgr_acks_one 306 } 307 for { set i 1 } { $i <= $nsites } { incr i } { 308 # 309 # If we're restoring we just need to copy it. 310 # 311 if { $cfgtype == "restore" } { 312 file copy $restoredir/DB_CONFIG.$i \ 313 $envdirs($i)/DB_CONFIG 314 file copy $restoredir/DB_CONFIG.$i \ 315 $savedir/DB_CONFIG.$i 316 continue 317 } 318 # 319 # Otherwise set up per-site config information 320 # 321 set cfglist $default_cfglist 322 323 # 324 # Add lease configuration if needed. We're running all 325 # locally, so there is no clock skew. 326 # 327 if { $lease } { 328 # 329 # We need to have an ack timeout > lease timeout. 330 # Otherwise txns can get committed without waiting 331 # long enough for leases to get granted. 332 # 333 lappend cfglist { "rep_set_config" "db_rep_conf_lease" } 334 lappend cfglist { "rep_set_timeout" \ 335 "db_rep_lease_timeout 10000000" } 336 lappend cfglist \ 337 { "rep_set_timeout" "db_rep_ack_timeout 20000000" } 338 } else { 339 lappend cfglist \ 340 { "rep_set_timeout" "db_rep_ack_timeout 5000000" } 341 } 342 343 # 344 # Priority 345 # 346 if { $state($i) == "MASTER" } { 347 lappend cfglist { "rep_set_priority" 100 } 348 } else { 349 if { $cfgtype == "random" } { 350 set pri [berkdb random_int 10 25] 351 } else { 352 set pri 20 353 } 354 lappend cfglist { "rep_set_priority" $pri } 355 } 356 # 357 # Others: limit size, bulk, 2site strict, 358 # 359 if { $cfgtype == "random" } { 360 set limit_sz [berkdb random_int 15000 1000000] 361 set bulk [berkdb random_int 0 1] 362 if { $bulk } { 363 lappend cfglist \ 364 { "rep_set_config" "db_rep_conf_bulk" } 365 } 366 if { $nsites == 2 } { 367 set strict [berkdb random_int 0 1] 368 if { $strict } { 369 lappend cfglist { "rep_set_config" \ 370 "db_repmgr_conf_2site_strict" } 371 } 372 } 373 } else { 374 set limit_sz 100000 375 } 376 lappend cfglist { "rep_set_limit" "0 $limit_sz" } 377 lappend cfglist { "repmgr_set_ack_policy" $ackpolicy } 378 set cid [open $envdirs($i)/DB_CONFIG a] 379 foreach c $cfglist { 380 set carg [subst [lindex $c 0]] 381 set cval [subst [lindex $c 1]] 382 puts $cid "$carg $cval" 383 } 384 close $cid 385 set cid [open $envdirs($i)/DB_CONFIG r] 386 set cfg [read $cid] 387 close $cid 388 389 save_db_reptest $savedir CONFIG $i $cfg 390 } 391 392} 393 394proc reptest_cleanup { dir } { 395 # 396 # For now, just completely remove it all. We might want 397 # to use env_cleanup at some point in the future. 398 # 399 fileremove -f $dir 400 file mkdir $dir 401} 402 403 404proc save_db_reptest { savedir op site savelist } { 405 # 406 # Save a copy of the configuration and args used to run this 407 # instance of the test. 408 # 409 if { $op == "CONFIG" } { 410 set outfile $savedir/DB_CONFIG.$site 411 } else { 412 set outfile $savedir/DB_REPTEST_ARGS.$site 413 } 414 set cid [open $outfile a] 415 puts -nonewline $cid $savelist 416 close $cid 417} 418 419proc run_db_reptest { savedir numsites runtime } { 420 source ./include.tcl 421 global killed_procs 422 423 set pids {} 424 for {set i 1} {$i <= $numsites} {incr i} { 425 lappend pids [exec $tclsh_path $test_path/wrap_reptest.tcl \ 426 $savedir/DB_REPTEST_ARGS.$i $savedir/site$i.log &] 427 tclsleep 1 428 } 429 watch_procs $pids 15 [expr $runtime * 3] 430 set killed [llength $killed_procs] 431 if { $killed > 0 } { 432 error "Processes $killed_procs never finished" 433 } 434} 435 436proc verify_db_reptest { num_sites edirs kill } { 437 upvar $edirs envdirs 438 439 set startenv 1 440 set cmpeid 2 441 if { $kill == 1 } { 442 set startenv 2 443 set cmpeid 3 444 } 445 set envbase [berkdb_env_noerr -home $envdirs($startenv)] 446 for { set i $cmpeid } { $i <= $num_sites } { incr i } { 447 if { $i == $kill } { 448 continue 449 } 450 set cmpenv [berkdb_env_noerr -home $envdirs($i)] 451 puts "Compare $envdirs($startenv) with $envdirs($i)" 452 # 453 # Compare 2 envs. We assume the name of the database that 454 # db_reptest creates and know it is 'am1.db'. 455 # We want as other args: 456 # 0 - compare_shared_portion 457 # 1 - match databases 458 # 0 - don't compare logs (for now) 459 rep_verify $envdirs($startenv) $envbase $envdirs($i) $cmpenv \ 460 0 1 0 am1.db 461 $cmpenv close 462 } 463 $envbase close 464} 465 466proc get_nsites { cfgtype restoredir } { 467 global last_nsites 468 469 # 470 # The number of sites must be the same for all. Read the 471 # first site's saved DB_CONFIG file if we're restoring since 472 # we only know we have at least 1 site. 473 # 474 if { $cfgtype == "restore" } { 475 set cid [open $restoredir/DB_CONFIG.1 r] 476 while { [gets $cid cfglist] } { 477 puts "Read in: $cfglist" 478 set cfg [lindex $cfglist 0] 479 if { $cfg == "rep_set_nsites" } { 480 set num_sites [lindex $cfglist 1] 481 break; 482 } 483 } 484 close $cid 485 return $num_sites 486 } 487 if { $cfgtype == "random" } { 488 # 489 # Sometimes 'random' doesn't seem to do a good job. I have 490 # seen on all iterations after the first one, nsites is 491 # always 2, 100% of the time. Add this bit to make sure 492 # this nsites values is different from the last iteration. 493 # 494 set n [berkdb random_int 2 5] 495 while { $n == $last_nsites } { 496 set n [berkdb random_int 2 5] 497puts "Getting random nsites between 2 and 5. Got $n, last_nsites $last_nsites" 498 } 499 set last_nsites $n 500 return $n 501# return [berkdb random_int 2 5] 502 } 503 if { $cfgtype == "basic0" } { 504 return 2 505 } 506 if { $cfgtype == "basic1" } { 507 return 3 508 } 509 return -1 510} 511 512# 513# Run with master leases? 25%/75% (use a master lease 25% of the time). 514# 515proc get_lease { cfgtype restoredir } { 516 # 517 # The number of sites must be the same for all. Read the 518 # first site's saved DB_CONFIG file if we're restoring since 519 # we only know we have at least 1 site. 520 # 521 if { $cfgtype == "restore" } { 522 set use_lease 0 523 set cid [open $restoredir/DB_CONFIG.1 r] 524 while { [gets $cid cfglist] } { 525# puts "Read in: $cfglist" 526 if { [llength $cfglist] == 0 } { 527 break; 528 } 529 set cfg [lindex $cfglist 0] 530 if { $cfg == "rep_set_config" } { 531 set lease [lindex $cfglist 1] 532 if { $lease == "db_rep_conf_lease" } { 533 set use_lease 1 534 break; 535 } 536 } 537 } 538 close $cid 539 return $use_lease 540 } 541 if { $cfgtype == "random" } { 542 set leases { 1 0 0 0 } 543 set len [expr [llength $leases] - 1] 544 set i [berkdb random_int 0 $len] 545 return [lindex $leases $i] 546 } 547 if { $cfgtype == "basic0" } { 548 return 0 549 } 550 if { $cfgtype == "basic1" } { 551 return 0 552 } 553} 554 555# 556# Do a kill test about half the time. We randomly choose a 557# site number to kill, it could be a master or a client. 558# Return 0 if we don't kill any site. 559# 560proc get_kill { cfgtype restoredir num_sites } { 561 if { $cfgtype == "restore" } { 562 set ksite 0 563 for { set i 1 } { $i <= $num_sites } { incr i } { 564 set cid [open $restoredir/DB_REPTEST_ARGS.$i r] 565 # 566 # !!! 567 # We currently assume the args file is 1 line. 568 # We assume only 1 site can get killed. So, if we 569 # find one, we break the loop and don't look further. 570 # 571 gets $cid arglist 572 close $cid 573# puts "Read in: $arglist" 574 set dokill [lsearch $arglist "-k"] 575 if { $dokill != -1 } { 576 set ksite $i 577 break 578 } 579 } 580 return $ksite 581 } 582 if { $cfgtype == "random" } { 583 set k { 0 0 0 1 1 1 0 1 1 0 } 584 set len [expr [llength $k] - 1] 585 set i [berkdb random_int 0 $len] 586 if { [lindex $k $i] == 1 } { 587 set ksite [berkdb random_int 1 $num_sites] 588 } else { 589 set ksite 0 590 } 591 return $ksite 592 } 593 if { $cfgtype == "basic0" || $cfgtype == "basic1" } { 594 return 0 595 } else { 596 error "Get_kill: Invalid config type $cfgtype" 597 } 598} 599 600# 601# Use peers or only the master for requests? 25%/75% (use a peer 25% 602# of the time and master 75%) 603# 604proc get_peers { cfgtype } { 605 if { $cfgtype == "random" } { 606 set peer { 0 0 0 1 } 607 set len [expr [llength $peer] - 1] 608 set i [berkdb random_int 0 $len] 609 return [lindex $peer $i] 610 } 611 if { $cfgtype == "basic0" || $cfgtype == "basic1" } { 612 return 0 613 } 614} 615 616# 617# Start with a master or all clients? 25%/75% (use a master 25% 618# of the time and have all clients 75%) 619# 620proc get_usemaster { cfgtype } { 621 if { $cfgtype == "random" } { 622 set mst { 1 0 0 0 } 623 set len [expr [llength $mst] - 1] 624 set i [berkdb random_int 0 $len] 625 return [lindex $mst $i] 626 } 627 if { $cfgtype == "basic0" } { 628 return 1 629 } 630 if { $cfgtype == "basic1" } { 631 return 0 632 } 633} 634 635# 636# If we use a master, which site? This proc will return 637# the site number of the mastersite, or it will return 638# 0 if no site should start as master. Sites are numbered 639# starting at 1. 640# 641proc get_mastersite { cfgtype usemaster nsites } { 642 if { $usemaster == 0 } { 643 return 0 644 } 645 if { $cfgtype == "random" } { 646 return [berkdb random_int 1 $nsites] 647 } 648 if { $cfgtype == "basic0" } { 649 return 1 650 } 651 if { $cfgtype == "basic1" } { 652 return 0 653 } 654} 655 656# 657# This is the number of worker threads performing the workload. 658# This is not the number of message processing threads. 659# 660# Scale back the number of worker threads if leases are in use. 661# The timing with leases can be fairly sensitive and since all sites 662# run on the local machine, too many workers on every site can 663# overwhelm the system, causing lost messages and delays that make 664# the tests fail. Rather than try to tweak timeouts, just reduce 665# the workloads a bit. 666# 667proc get_workers { cfgtype lease } { 668 if { $cfgtype == "random" } { 669 if { $lease } { 670 return [berkdb random_int 2 4] 671 } else { 672 return [berkdb random_int 2 8] 673 } 674 } 675 if { $cfgtype == "basic0" || $cfgtype == "basic1" } { 676 return 5 677 } 678} 679 680proc get_dbtype { cfgtype } { 681 if { $cfgtype == "random" } { 682 # 683 # 50% btree, 25% queue 12.5% hash 12.5% recno 684 # We favor queue only because there is special handling 685 # for queue in internal init. 686 # 687# set methods {btree btree btree btree queue queue hash recno} 688 set methods {btree btree btree btree hash recno} 689 set len [expr [llength $methods] - 1] 690 set i [berkdb random_int 0 $len] 691 return [lindex $methods $i] 692 } 693 if { $cfgtype == "basic0" || $cfgtype == "basic1" } { 694 return btree 695 } 696} 697 698proc get_runtime { cfgtype } { 699 if { $cfgtype == "random" } { 700 return [berkdb random_int 100 500] 701 } 702 if { $cfgtype == "basic0" } { 703 return 100 704 } 705 if { $cfgtype == "basic1" } { 706 return 150 707 } 708} 709 710proc get_rport { baseport i num_sites cfgtype} { 711 global rporttype 712 713 if { $cfgtype == "random" && $rporttype == "NULL" } { 714 # 715 # The circular comm choices seem problematic. 716 # Remove them for now. 717 # 718# set types {backcirc forwcirc full onesite} 719 set types {full onesite} 720 set len [expr [llength $types] - 1] 721 set rindex [berkdb random_int 0 $len] 722 set rporttype [lindex $types $rindex] 723 } 724 if { $cfgtype == "basic0" } { 725 set rporttype onesite 726 } 727 if { $cfgtype == "basic1" } { 728 set rporttype full 729 } 730 # 731 # This produces a circular knowledge ring. Either forward 732 # or backward. In the forwcirc, ENV1 knows (via -r) about 733 # ENV2, ENV2 knows about ENV3, ..., ENVX knows about ENV1. 734 # 735 if { $rporttype == "forwcirc" } { 736 if { $i != $num_sites } { 737 return [list [expr $baseport + $i + 1]] 738 } else { 739 return [list [expr $baseport + 1]] 740 } 741 } 742 if { $rporttype == "backcirc" } { 743 if { $i != 1 } { 744 return [list [expr $baseport + $i - 1]] 745 } else { 746 return [list [expr $baseport + $num_sites]] 747 } 748 } 749 # 750 # This produces a configuration where site 1 does not know 751 # about any other site and every other site knows about site 1. 752 # 753 if { $rporttype == "onesite" } { 754 if { $i == 1 } { 755 return {} 756 } else { 757 return [list [expr $baseport + 1]] 758 } 759 } 760 # 761 # This produces a fully connected configuration 762 # 763 if { $rporttype == "full" } { 764 set rlist {} 765 for { set site 1 } { $site <= $num_sites } { incr site } { 766 if { $site != $i } { 767 lappend rlist [expr $baseport + $site] 768 } 769 } 770 return $rlist 771 } 772} 773 774proc parse_runtime { progargs } { 775 set i [lsearch $progargs "-T"] 776 set val [lindex $progargs [expr $i + 1]] 777 return $val 778} 779