1# See the file LICENSE for redistribution information.
2#
3# Copyright (c) 2004-2009 Oracle.  All rights reserved.
4#
5# $Id$
6#
7# TEST	rep039
8# TEST	Test of interrupted internal initialization.  The
9# TEST	interruption is due to a changed master, or the client crashing,
10# TEST	or both.
11# TEST
12# TEST	One master, two clients.
13# TEST	Generate several log files. Remove old master log files.
14# TEST	Restart client, optionally having "cleaned" client env dir.  Either
15# TEST	way, this has the effect of forcing an internal init.
16# TEST	Interrupt the internal init.
17# TEST	Vary the number of times we process messages to make sure
18# TEST	the interruption occurs at varying stages of the first internal
19# TEST	initialization.
20# TEST
21# TEST	Run for btree and queue only because of the number of permutations.
22# TEST
proc rep039 { method { niter 200 } { tnum "039" } args } {
	# Driver for the interrupted-internal-init test.  Decides whether
	# the test applies to the given access method and configuration,
	# then calls rep039_sub once for every combination of recovery,
	# crash type, clean, archive, client log placement and message
	# iteration count.

	source ./include.tcl
	global databases_in_memory
	global repfiles_in_memory
	global mixed_mode_logging

	# When $checking_valid_methods is set, only report which of the
	# valid methods this test handles (btree and queue) and return.
	if { $checking_valid_methods } {
		set supported {}
		foreach candidate $valid_methods {
			if { [is_btree $candidate] == 1 || \
			    [is_queue $candidate] == 1 } {
				lappend supported $candidate
			}
		}
		return $supported
	}
	if { [is_btree $method] == 0 && [is_queue $method] == 0 } {
		puts "Rep$tnum: skipping for non-btree, non-queue method."
		return
	}

	# The set of iterations is already very large, so mixed-mode
	# logging is skipped.
	if { $mixed_mode_logging > 0 } {
		puts "Rep$tnum: Skipping for mixed mode logging."
		return
	}

	# The test sets its own pagesize, so a caller-supplied one is
	# not supported.
	if { [lsearch -exact $args "-pagesize"] != -1 } {
		puts "Rep$tnum: skipping for specific pagesizes"
		return
	}

	set args [convert_args $method $args]

	# Pick the message fragment describing on-disk or in-memory
	# databases; extent queues are skipped with in-memory databases.
	if { $databases_in_memory } {
		set msg "using named in-memory databases"
		if { [is_queueext $method] } {
			puts -nonewline "Skipping rep$tnum for method "
			puts "$method with named in-memory databases."
			return
		}
	} else {
		set msg "using on-disk databases"
	}

	if { $repfiles_in_memory } {
		set msg2 "and in-memory replication files"
	} else {
		set msg2 "and on-disk replication files"
	}

	# Run the body of the test with and without recovery,
	# and with and without cleaning.
	set cleanopts [list noclean clean]
	set archopts [list archive noarchive]
	set nummsgs 4
	foreach recopt $test_recopts {
		# Client crashes are only tested with recovery, and never on
		# Windows or HP-UX.  Only the "client under test" is run with
		# in-memory logs, and only when not recovering.
		if { $recopt == "-recover" } {
			set with "with"
			set cl_logopts [list on-disk]
			if { ! $is_windows_test && ! $is_hp_test } {
				set crashopts \
				    [list master_change client_crash both]
			} else {
				set crashopts [list master_change]
			}
		} else {
			set with "without"
			set cl_logopts [list on-disk in-memory]
			set crashopts [list master_change]
		}

		# Total number of rep039_sub runs for this recovery setting,
		# used only in the progress message.
		set maxtest [expr {[llength $crashopts] * \
		    [llength $cleanopts] * [llength $archopts] * \
		    [llength $cl_logopts] * $nummsgs}]

		set testnum 1
		foreach crash $crashopts {
			foreach clean $cleanopts {
				foreach arch $archopts {
					foreach logopt $cl_logopts {
						for { set nmsg 1 } \
						    { $nmsg <= $nummsgs } \
						    { incr nmsg } {
							puts "Rep$tnum ($method $recopt $clean $arch $crash $logopt $args):\
							    Test of internal init. $nmsg message iters. \
							    Test $testnum of $maxtest tests $with recovery $msg $msg2."
							rep039_sub $method \
							    $niter $tnum \
							    $recopt $clean \
							    $arch $crash \
							    $logopt $nmsg \
							    $args
							incr testnum
						}
					}
				}
			}
		}
	}
}
128
proc rep039_sub \
    { method niter tnum recargs clean archive crash cl_logopt pmsgs largs } {
	# Run one permutation of the interrupted internal init test.
	#
	# method    - access method handed to rep_test.
	# niter     - iteration count handed to rep_test while generating
	#             log files.
	# tnum      - test number string used in progress messages.
	# recargs   - extra env open arguments; "-recover" selects recovery.
	# clean     - "clean" wipes the test client's env dir before it
	#             is reopened; "noclean" leaves it in place.
	# archive   - "archive" also removes old logs on the other client.
	# crash     - "master_change", "client_crash" or "both".
	# cl_logopt - log placement for the client under test; "on-disk"
	#             gets an explicit -log_buffer, anything else uses
	#             whatever adjust_logargs returns.
	# pmsgs     - number of proc_msgs_once passes made before the
	#             interruption (one pass is always made up front).
	# largs     - extra arguments appended to every rep_test call.
	#
	# Raises an error for an unrecognized crash type.

	global testdir
	global util_path
	global databases_in_memory
	global repfiles_in_memory
	global rep_verbose
	global verbose_type

	set verbargs ""
	if { $rep_verbose == 1 } {
		set verbargs " -verbose {$verbose_type on} "
	}

	set repmemargs ""
	if { $repfiles_in_memory } {
		set repmemargs "-rep_inmem_files "
	}

	# Translate the crash type into the two independent interruptions.
	switch -exact -- $crash {
		master_change {
			set master_change true
			set client_crash false
		}
		client_crash {
			set master_change false
			set client_crash true
		}
		both {
			set master_change true
			set client_crash true
		}
		default {
			error "FAIL:[timestamp] '$crash' is an unrecognized crash type"
		}
	}

	env_cleanup $testdir

	replsetup $testdir/MSGQUEUEDIR

	# This test has three replication sites: a master, a client whose
	# behavior is under test, and another client.  We'll call them
	# "A", "B" and "C".  At one point during the test, we may (depending on
	# the setting of $master_change) switch roles between the master and the
	# other client.
	#
	# The initial site/role assignments are as follows:
	#
	#     A = master
	#     B = client under test
	#     C = other client
	#
	# In the case where we do switch roles, the roles become:
	#
	#     A = other client
	#     B = client under test (no change here)
	#     C = master
	#
	# Although the real names are A, B, and C, we'll use mnemonic names
	# whenever possible.  In particular, this means that we'll have to
	# re-jigger the mnemonic names after the role switch.

	file mkdir [set dirs(A) $testdir/SITE_A]
	file mkdir [set dirs(B) $testdir/SITE_B]
	file mkdir [set dirs(C) $testdir/SITE_C]

	# Log size is small so we quickly create more than one.
	# The documentation says that the log file must be at least
	# four times the size of the in-memory log buffer.
	set pagesize 4096
	append largs " -pagesize $pagesize "
	set log_buf [expr {$pagesize * 2}]
	set log_max [expr {$log_buf * 4}]

	# Set up the three sites: A, B, and C will correspond to EID's
	# 1, 2, and 3 in the obvious way.  As we start out, site A is always the
	# master.
	#
	repladd 1
	set env_A_cmd "berkdb_env_noerr -create -txn nosync \
	    $verbargs $repmemargs \
	    -log_buffer $log_buf -log_max $log_max -errpfx SITE_A \
	    -home $dirs(A) -rep_transport \[list 1 replsend\]"
	set envs(A) [eval $env_A_cmd $recargs -rep_master]

	# Open a client
	repladd 2
	set txn_arg [adjust_txnargs $cl_logopt]
	set log_arg [adjust_logargs $cl_logopt]
	if { $cl_logopt == "on-disk" } {
		# Override in this case, because we want to specify log_buffer.
		set log_arg "-log_buffer $log_buf"
	}
	set env_B_cmd "berkdb_env_noerr -create $txn_arg \
	    $verbargs $repmemargs \
	    $log_arg -log_max $log_max -errpfx SITE_B \
	    -home $dirs(B) -rep_transport \[list 2 replsend\]"
	set envs(B) [eval $env_B_cmd $recargs -rep_client]

	# Open 2nd client
	repladd 3
	set env_C_cmd "berkdb_env_noerr -create -txn nosync \
	    $verbargs $repmemargs \
	    -log_buffer $log_buf -log_max $log_max -errpfx SITE_C \
	    -home $dirs(C) -rep_transport \[list 3 replsend\]"
	set envs(C) [eval $env_C_cmd $recargs -rep_client]

	# Turn off throttling for this test.
	foreach site [array names envs] {
		$envs($site) rep_limit 0 0
	}

	# Bring the clients online by processing the startup messages.
	set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	process_msgs $envlist

	# Set up the (indirect) mnemonic role names for the first part of the
	# test.
	set master A
	set test_client B
	set other C

	# Clobber replication's 30-second anti-archive timer, which will have
	# been started by client sync-up internal init, so that we can do a
	# log_archive in a moment.
	#
	$envs($master) test force noarchive_timeout

	# Run rep_test in the master (and update client).
	puts "\tRep$tnum.a: Running rep_test in replicated env."
	eval rep_test $method $envs($master) NULL $niter 0 0 0 $largs
	process_msgs $envlist

	puts "\tRep$tnum.b: Close client."
	error_check_good client_close [$envs($test_client) close] 0

	# Remember the newest log file the test client has, so we can tell
	# when the master has archived past it.
	set res [eval exec $util_path/db_archive -l -h $dirs($test_client)]
	set last_client_log [lindex [lsort $res] end]

	# Keep generating and archiving master logs until the test client's
	# last log file no longer exists on the master.
	set stop 0
	while { $stop == 0 } {
		# Run rep_test in the master (don't update client).
		puts "\tRep$tnum.c: Running rep_test in replicated env."
		eval rep_test $method $envs($master) NULL $niter 0 0 0 $largs
		#
		# Clear messages for first client.  We want that site
		# to get far behind.
		#
		replclear 2
		puts "\tRep$tnum.d: Run db_archive on master."
		eval exec $util_path/db_archive -d -h $dirs($master)
		set res [eval exec $util_path/db_archive -l -h $dirs($master)]
		if { [lsearch -exact $res $last_client_log] == -1 } {
			set stop 1
		}
	}

	# Bring only the other client up to date; the test client is closed.
	set envlist "{$envs($master) 1} {$envs($other) 3}"
	process_msgs $envlist

	if { $archive == "archive" } {
		puts "\tRep$tnum.e: Run db_archive on other client."
		set res [eval exec $util_path/db_archive -l -h $dirs($other)]
		error_check_bad \
		    log.1.present [lsearch -exact $res log.0000000001] -1
		eval exec $util_path/db_archive -d -h $dirs($other)
		set res [eval exec $util_path/db_archive -l -h $dirs($other)]
		error_check_good \
		    log.1.gone [lsearch -exact $res log.0000000001] -1
	} else {
		puts "\tRep$tnum.e: Skipping db_archive on other client."
	}

	puts "\tRep$tnum.f: Reopen test client ($clean)."
	if { $clean == "clean" } {
		env_cleanup $dirs($test_client)
	}

	# (The test client is always site B, EID 2.)
	#
	set envs(B) [eval $env_B_cmd $recargs -rep_client]
	error_check_good client_env [is_valid_env $envs(B)] TRUE
	$envs(B) rep_limit 0 0

	# Hold an open database handle while doing internal init, to make sure
	# no bad lock interactions are happening.  But only do so some of the
	# time, and of course only if it's reasonable to expect the database to
	# exist at this point.  (It won't, if we're using in-memory databases
	# and we've just started the client with recovery, since recovery blows
	# away the mpool.)  Set up database as in-memory or on-disk first.
	#
	if { $databases_in_memory } {
		set dbname { "" "test.db" }
		set have_db [expr {$recargs != "-recover"}]
	} else {
		set dbname "test.db"
		set have_db true
	}

	if {$clean == "noclean" && $have_db && [berkdb random_int 0 1] == 1} {
		puts "\tRep$tnum.g: Hold open db handle from client app."
		set cdb [eval\
		    {berkdb_open_noerr -env} $envs($test_client) $dbname]
		error_check_good dbopen [is_valid_db $cdb] TRUE
		set ccur [$cdb cursor]
		error_check_good curs [is_valid_cursor $ccur $cdb] TRUE
		# Touch the first record through the cursor; the record
		# itself is not needed, only the access.
		$ccur get -first
		error_check_good cclose [$ccur close] 0
	} else {
		puts "\tRep$tnum.g: (No client app handle will be held.)"
		set cdb "NONE"
	}

	set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	proc_msgs_once $envlist

	#
	# We want to simulate a master continually getting new
	# records while an update is going on.
	#
	set entries 10
	eval rep_test $method $envs($master) NULL $entries $niter 0 0 $largs
	#
	# We call proc_msgs_once N times to get us into page recovery:
	# 1.  Send master messages and client finds master.
	# 2.  Master replies and client does verify.
	# 3.  Master gives verify_fail and client does update_req.
	# 4.  Master send update info and client does page_req.
	#
	# We vary the number of times we call proc_msgs_once (via pmsgs)
	# so that we test switching master at each point in the
	# internal initialization processing.  One call has already been
	# made above, so this loop makes the remaining pmsgs - 1 calls.
	#
	puts "\tRep$tnum.h: Get partially through initialization ($pmsgs iters)"
	for { set i 1 } { $i < $pmsgs } { incr i } {
		proc_msgs_once $envlist
	}

	if { [string is true $master_change] } {
		replclear 1
		replclear 3
		puts "\tRep$tnum.i: Downgrade/upgrade master."

		# Downgrade the existing master to a client, switch around the
		# roles, and then upgrade the newly appointed master.
		error_check_good downgrade [$envs($master) rep_start -client] 0

		set master C
		set other A

		error_check_good upgrade [$envs($master) rep_start -master] 0
	}

	# Simulate a client crash: simply abandon the handle without closing it.
	# Note that this doesn't work on Windows, because there you can't remove
	# a file if anyone (including yourself) has it open.  This also does not
	# work on HP-UX, because there you are not allowed to open a second
	# handle on an env.
	#
	# Note that crashing only makes sense with "-recover".
	#
	if { [string is true $client_crash] } {
		error_check_good assert [string compare $recargs "-recover"] 0

		set abandoned_env $envs($test_client)
		set abandoned true

		set envs($test_client) [eval $env_B_cmd $recargs -rep_client]
		$envs($test_client) rep_limit 0 0

		# Again, remember: whatever the current roles, a site and its EID
		# stay linked always.
		#
		set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	} else {
		set abandoned false
	}

	process_msgs $envlist
	#
	# Now simulate continual updates to the new master.  Each
	# time through we just process messages once before
	# generating more updates.
	#
	set nrounds 10
	for { set i 0 } { $i < $nrounds } { incr i } {
		set start [expr {$i * $entries}]
		eval rep_test $method $envs($master) NULL $entries $start \
		    $start 0 $largs
		# Every round must have processed at least one message.
		set nproced [proc_msgs_once $envlist]
		error_check_bad nproced $nproced 0
	}
	# $i is now $nrounds, so this is the offset just past the last round.
	set start [expr {$i * $entries}]
	process_msgs $envlist

	puts "\tRep$tnum.j: Verify logs and databases"
	# Whether or not we've switched roles, it's always site A that may have
	# had its logs archived away.  When the $init_test flag is turned on,
	# rep_verify allows the site in the second position to have
	# (more-)archived logs, so we have to abuse the calling signature a bit
	# here to get this to work.  (I.e., even when A is still master and C is
	# still the other client, we have to pass things in this order so that
	# the $init_test different-sized-logs trick can work.)
	#
	set init_test 1
	rep_verify $dirs(C) $envs(C) $dirs(A) $envs(A) $init_test

	# Process messages again in case we are running with debug_rop.
	process_msgs $envlist
	rep_verify $dirs($master) $envs($master) \
	    $dirs($test_client) $envs($test_client) $init_test

	# Add records to the master and update client.
	puts "\tRep$tnum.k: Add more records and check again."
	set entries 10
	eval rep_test $method $envs($master) NULL $entries $start \
	    $start 0 $largs
	# NOTE(review): "err" is handed to process_msgs but never inspected
	# here; presumably it only lets message-processing errors be
	# collected rather than raised -- confirm against process_msgs.
	process_msgs $envlist 0 NONE err

	# Check again that everyone is identical.
	rep_verify $dirs(C) $envs(C) $dirs(A) $envs(A) $init_test
	process_msgs $envlist
	rep_verify $dirs($master) $envs($master) \
	    $dirs($test_client) $envs($test_client) $init_test

	if {$cdb != "NONE"} {
		if {$abandoned} {
			# The $cdb was opened in an env which was then
			# abandoned, recovered, marked panic'ed.  We don't
			# really care; we're just trying to clean up resources.
			#
			catch {$cdb close}
		} else {
			error_check_good clientdb_close [$cdb close] 0
		}
	}
	error_check_good masterenv_close [$envs($master) close] 0
	error_check_good clientenv_close [$envs($test_client) close] 0
	error_check_good clientenv2_close [$envs($other) close] 0
	if { $abandoned } {
		# Best-effort close of the handle abandoned by the simulated
		# crash; a failure here is ignored.
		catch {$abandoned_env close}
	}
	replclose $testdir/MSGQUEUEDIR
}
473