1# See the file LICENSE for redistribution information.
2#
3# Copyright (c) 2004,2008 Oracle.  All rights reserved.
4#
5# $Id: rep039.tcl,v 1.31 2008/04/10 17:19:47 carol Exp $
6#
7# TEST	rep039
8# TEST	Test of interrupted internal initialization changes.  The
9# TEST	interruption is due to a changed master, or the client crashing,
10# TEST	or both.
11# TEST
12# TEST	One master, two clients.
13# TEST	Generate several log files. Remove old master log files.
14# TEST	Restart client, optionally having "cleaned" client env dir.  Either
15# TEST	way, this has the effect of forcing an internal init.
16# TEST	Interrupt the internal init.
17# TEST	Vary the number of times we process messages to make sure
18# TEST	the interruption occurs at varying stages of the first internal
19# TEST	initialization.
20# TEST
21# TEST	Run for btree and queue only because of the number of permutations.
22# TEST
# rep039 --
#	Driver for the interrupted-internal-init test.  Iterates over every
#	combination of recovery option, crash type, clean/noclean,
#	archive/noarchive, client log type and number of message-processing
#	iterations, and calls rep039_sub once per combination.
#
# Arguments:
#	method	Access method to test; only btree and queue are run.
#	niter	Number of entries passed through to rep039_sub (default 200).
#	tnum	Test number used in the progress messages (default "039").
#	args	Extra database open arguments; a "-pagesize" argument causes
#		the test to be skipped because the test sets its own.
#
# Results:
#	When $checking_valid_methods is true, returns the subset of
#	$valid_methods that are btree or queue.  Otherwise returns nothing.
#
proc rep039 { method { niter 200 } { tnum "039" } args } {

	# Sourced at proc scope; the configuration variables used below
	# ($checking_valid_methods, $valid_methods, $test_recopts,
	# $is_windows_test, $is_hp_test) are presumably made visible by
	# this file -- they are not set anywhere in this proc.
	source ./include.tcl

	# Run for btree and queue methods only.
	if { $checking_valid_methods } {
		set test_methods {}
		# Use a separate loop variable so the "method" parameter is
		# not overwritten while filtering.
		foreach candidate $valid_methods {
			if { [is_btree $candidate] == 1 || \
			    [is_queue $candidate] == 1 } {
				lappend test_methods $candidate
			}
		}
		return $test_methods
	}
	if { [is_btree $method] == 0 && [is_queue $method] == 0 } {
		puts "Rep$tnum: skipping for non-btree, non-queue method."
		return
	}

	# Skip for mixed-mode logging -- this test has a very large
	# set of iterations already.
	global mixed_mode_logging
	if { $mixed_mode_logging > 0 } {
		puts "Rep$tnum: Skipping for mixed mode logging."
		return
	}

	# This test needs to set its own pagesize.
	set pgindex [lsearch -exact $args "-pagesize"]
	if { $pgindex != -1 } {
		puts "Rep$tnum: skipping for specific pagesizes"
		return
	}

	set args [convert_args $method $args]

	# Run the body of the test with and without recovery,
	# and with and without cleaning.
	set cleanopts { noclean clean }
	set archopts { archive noarchive }
	set nummsgs 4

	# The announcement is kept as a script and eval'ed inside the
	# innermost loop, so that the loop variables it names ($r, $clean,
	# $a, $crash, $l, $i, $cnt, $maxtest, $with) are substituted with
	# their current values each time.
	set announce {puts "Rep$tnum ($method $r $clean $a $crash $l $args):\
	    Test of internal init. $i message iters. \
	    Test $cnt of $maxtest tests $with recovery."}
	foreach r $test_recopts {
		# Client crashes are only exercised with recovery, and not
		# on Windows or HP-UX.
		if { $r == "-recover" && ! $is_windows_test && ! $is_hp_test } {
			set crashopts { master_change client_crash both }
		} else {
			set crashopts { master_change }
		}
		# Only one of the three sites in the replication group needs to
		# be tested with in-memory logs: the "client under test".
		#
		if { $r == "-recover" } {
			set cl_logopts { on-disk }
			set with "with"
		} else {
			set cl_logopts { on-disk in-memory }
			set with "without"
		}
		# Total number of rep039_sub runs for this recovery option;
		# only used in the progress message.
		set maxtest [expr {[llength $crashopts] * \
		    [llength $cleanopts] * \
		    [llength $archopts] * \
		    [llength $cl_logopts] * \
		    $nummsgs}]
		set cnt 1
		foreach crash $crashopts {
			foreach clean $cleanopts {
				foreach a $archopts {
					foreach l $cl_logopts {
						for { set i 1 } \
						    { $i <= $nummsgs } \
						    { incr i } {
							eval $announce
							rep039_sub $method \
							    $niter $tnum $r \
							    $clean $a $crash \
							    $l $i $args
							incr cnt
						}
					}
				}
			}
		}
	}
}
110
# rep039_sub --
#	Run one permutation of the interrupted-internal-init test with one
#	master and two clients.
#
# Arguments:
#	method		Access method handed to rep_test.
#	niter		Number of entries for the initial rep_test runs.
#	tnum		Test number used in progress messages.
#	recargs		Recovery argument appended to every env open
#			(must be "-recover" when a client crash is requested).
#	clean		"clean" to wipe the test client's env dir before
#			reopening it; anything else leaves it intact.
#	archive		"archive" to also run db_archive -d on the other
#			client; anything else skips that step.
#	crash		One of "master_change", "client_crash" or "both".
#	cl_logopt	Log option for the client under test; passed to
#			adjust_txnargs/adjust_logargs ("on-disk" gets an
#			explicit -log_buffer instead).
#	pmsgs		Total number of single message-processing passes
#			made before the interruption.
#	largs		Extra arguments appended to each rep_test call.
#
# Results:
#	None.  Raises an error for an unrecognized $crash value; failures
#	are otherwise reported through the error_check_* helpers.
#
proc rep039_sub \
    { method niter tnum recargs clean archive crash cl_logopt pmsgs largs } {
	global testdir
	global util_path
	global rep_verbose
	global verbose_type

	set verbargs ""
	if { $rep_verbose == 1 } {
		set verbargs " -verbose {$verbose_type on} "
	}

	# Decode the crash type into two independent flags.
	set master_change false
	set client_crash false
	if { $crash == "master_change" } {
		set master_change true
	} elseif { $crash == "client_crash" } {
		set client_crash true
	} elseif { $crash == "both" } {
		set master_change true
		set client_crash true
	} else {
		error "FAIL:[timestamp] '$crash' is an unrecognized crash type"
	}

	env_cleanup $testdir

	replsetup $testdir/MSGQUEUEDIR

	# This test has three replication sites: a master, a client whose
	# behavior is under test, and another client.  We'll call them
	# "A", "B" and "C".  At one point during the test, we may (depending on
	# the setting of $master_change) switch roles between the master and the
	# other client.
	#
	# The initial site/role assignments are as follows:
	#
	#     A = master
	#     B = client under test
	#     C = other client
	#
	# In the case where we do switch roles, the roles become:
	#
	#     A = other client
	#     B = client under test (no change here)
	#     C = master
	#
	# Although the real names are A, B, and C, we'll use mnemonic names
	# whenever possible.  In particular, this means that we'll have to
	# re-jigger the mnemonic names after the role switch.

	file mkdir [set dirs(A) $testdir/SITE_A]
	file mkdir [set dirs(B) $testdir/SITE_B]
	file mkdir [set dirs(C) $testdir/SITE_C]

	# Log size is small so we quickly create more than one.
	# The documentation says that the log file must be at least
	# four times the size of the in-memory log buffer.
	set pagesize 4096
	append largs " -pagesize $pagesize "
	set log_buf [expr {$pagesize * 2}]
	set log_max [expr {$log_buf * 4}]

	# Set up the three sites: A, B, and C will correspond to EID's
	# 1, 2, and 3 in the obvious way.  As we start out, site A is always the
	# master.
	#
	repladd 1
	set env_A_cmd "berkdb_env_noerr -create -txn nosync $verbargs \
	    -log_buffer $log_buf -log_max $log_max -errpfx SITE_A \
	    -home $dirs(A) -rep_transport \[list 1 replsend\]"
	set envs(A) [eval $env_A_cmd $recargs -rep_master]

	# Open a client
	repladd 2
	set txn_arg [adjust_txnargs $cl_logopt]
	set log_arg [adjust_logargs $cl_logopt]
	if { $cl_logopt == "on-disk" } {
		# Override in this case, because we want to specify log_buffer.
		set log_arg "-log_buffer $log_buf"
	}
	# The command string is kept so the test client can be reopened
	# later with exactly the same options.
	set env_B_cmd "berkdb_env_noerr -create $txn_arg $verbargs \
	    $log_arg -log_max $log_max -errpfx SITE_B \
	    -home $dirs(B) -rep_transport \[list 2 replsend\]"
	set envs(B) [eval $env_B_cmd $recargs -rep_client]

	# Open 2nd client
	repladd 3
	set env_C_cmd "berkdb_env_noerr -create -txn nosync $verbargs \
	    -log_buffer $log_buf -log_max $log_max -errpfx SITE_C \
	    -home $dirs(C) -rep_transport \[list 3 replsend\]"
	set envs(C) [eval $env_C_cmd $recargs -rep_client]

	# Turn off throttling for this test.
	foreach site [array names envs] {
		$envs($site) rep_limit 0 0
	}

	# Bring the clients online by processing the startup messages.
	set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	process_msgs $envlist

	# Set up the (indirect) mnemonic role names for the first part of the
	# test.
	set master A
	set test_client B
	set other C

	# Clobber replication's 30-second anti-archive timer, which will have
	# been started by client sync-up internal init, so that we can do a
	# log_archive in a moment.
	#
	$envs($master) test force noarchive_timeout

	# Run rep_test in the master (and update client).
	puts "\tRep$tnum.a: Running rep_test in replicated env."
	eval rep_test $method $envs($master) NULL $niter 0 0 0 0 $largs
	process_msgs $envlist

	puts "\tRep$tnum.b: Close client."
	error_check_good client_close [$envs($test_client) close] 0

	# Remember the newest log file the test client has, so we can tell
	# when the master has archived past it.
	set res [eval exec $util_path/db_archive -l -h $dirs($test_client)]
	set last_client_log [lindex [lsort $res] end]

	set stop 0
	while { $stop == 0 } {
		# Run rep_test in the master (don't update client).
		puts "\tRep$tnum.c: Running rep_test in replicated env."
		eval rep_test $method $envs($master) NULL $niter 0 0 0 0 $largs
		#
		# Clear messages for first client.  We want that site
		# to get far behind.
		#
		replclear 2
		puts "\tRep$tnum.d: Run db_archive on master."
		set res [eval exec $util_path/db_archive -d -h $dirs($master)]
		set res [eval exec $util_path/db_archive -l -h $dirs($master)]
		# Stop once the master no longer lists the client's last log.
		if { [lsearch -exact $res $last_client_log] == -1 } {
			set stop 1
		}
	}

	# Only the master and the other client are up at this point.
	set envlist "{$envs($master) 1} {$envs($other) 3}"
	process_msgs $envlist

	if { $archive == "archive" } {
		puts "\tRep$tnum.e: Run db_archive on other client."
		set res [eval exec $util_path/db_archive -l -h $dirs($other)]
		error_check_bad \
		    log.1.present [lsearch -exact $res log.0000000001] -1
		set res [eval exec $util_path/db_archive -d -h $dirs($other)]
		set res [eval exec $util_path/db_archive -l -h $dirs($other)]
		error_check_good \
		    log.1.gone [lsearch -exact $res log.0000000001] -1
	} else {
		puts "\tRep$tnum.e: Skipping db_archive on other client."
	}

	puts "\tRep$tnum.f: Reopen test client ($clean)."
	if { $clean == "clean" } {
		env_cleanup $dirs($test_client)
	}

	# (The test client is always site B, EID 2.)
	#
	set envs(B) [eval $env_B_cmd $recargs -rep_client]
	error_check_good client_env [is_valid_env $envs(B)] TRUE
	$envs(B) rep_limit 0 0

	# Hold an open database handle while doing internal init, to make sure
	# no bad lock interactions are happening.  But only do so some of the
	# time.
	#
	if {$clean == "noclean" && [berkdb random_int 0 1] == 1} {
		puts "\tRep$tnum.g: Hold open db handle from client app."
		set cdb [eval {berkdb_open_noerr -env} $envs($test_client) "test.db"]
		error_check_good dbopen [is_valid_db $cdb] TRUE
		set ccur [$cdb cursor]
		error_check_good curs [is_valid_cursor $ccur $cdb] TRUE
		# Position the cursor on the first record; the record
		# itself is not needed.
		$ccur get -first
		error_check_good cclose [$ccur close] 0
	} else {
		puts "\tRep$tnum.g: (No client app handle will be held.)"
		set cdb "NONE"
	}

	set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	proc_msgs_once $envlist

	#
	# We want to simulate a master continually getting new
	# records while an update is going on.
	#
	set entries 10
	eval rep_test $method $envs($master) NULL $entries $niter 0 0 0 $largs
	#
	# We call proc_msgs_once N times to get us into page recovery:
	# 1.  Send master messages and client finds master.
	# 2.  Master replies and client does verify.
	# 3.  Master gives verify_fail and client does update_req.
	# 4.  Master send update info and client does page_req.
	#
	# We vary the number of times we call proc_msgs_once (via pmsgs)
	# so that we test switching master at each point in the
	# internal initialization processing.
	#
	# One pass has already been made above, so this loop makes the
	# remaining $pmsgs - 1 passes.
	#
	puts "\tRep$tnum.h: Get partially through initialization ($pmsgs iters)"
	for { set i 1 } { $i < $pmsgs } { incr i } {
		proc_msgs_once $envlist
	}

	if { [string is true $master_change] } {
		# Discard anything still queued for sites A and C before
		# switching their roles.
		replclear 1
		replclear 3
		puts "\tRep$tnum.i: Downgrade/upgrade master."

		# Downgrade the existing master to a client, switch around the
		# roles, and then upgrade the newly appointed master.
		error_check_good downgrade [$envs($master) rep_start -client] 0

		set master C
		set other A

		error_check_good upgrade [$envs($master) rep_start -master] 0
	}

	# Simulate a client crash: simply abandon the handle without closing it.
	# Note that this doesn't work on Windows, because there you can't remove
	# a file if anyone (including yourself) has it open.  This also does not
	# work on HP-UX, because there you are not allowed to open a second
	# handle on an env.
	#
	# Note that crashing only makes sense with "-recover".
	#
	if { [string is true $client_crash] } {
		error_check_good assert [string compare $recargs "-recover"] 0

		set abandoned_env $envs($test_client)
		set abandoned true

		set envs($test_client) [eval $env_B_cmd $recargs -rep_client]
		$envs($test_client) rep_limit 0 0

		# Again, remember: whatever the current roles, a site and its EID
		# stay linked always.
		#
		set envlist "{$envs(A) 1} {$envs(B) 2} {$envs(C) 3}"
	} else {
		set abandoned false
	}

	process_msgs $envlist
	#
	# Now simulate continual updates to the new master.  Each
	# time through we just process messages once before
	# generating more updates.
	#
	# A separate local is used for the round count so the niter
	# parameter is not silently overwritten.
	#
	set nrounds 10
	for { set i 0 } { $i < $nrounds } { incr i } {
		set start [expr {$i * $entries}]
		eval rep_test $method $envs($master) NULL $entries $start \
		    $start 0 0 $largs
		# Every round must process at least one message.
		set nproced [proc_msgs_once $envlist]
		error_check_bad nproced $nproced 0
	}
	# $i equals $nrounds here, so this is the first key offset past
	# everything written by the loop above.
	set start [expr {$i * $entries}]
	process_msgs $envlist

	puts "\tRep$tnum.j: Verify logs and databases"
	# Whether or not we've switched roles, it's always site A that may have
	# had its logs archived away.  When the $init_test flag is turned on,
	# rep_verify allows the site in the second position to have
	# (more-)archived logs, so we have to abuse the calling signature a bit
	# here to get this to work.  (I.e., even when A is still master and C is
	# still the other client, we have to pass things in this order so that
	# the $init_test different-sized-logs trick can work.)
	#
	set init_test 1
	rep_verify $dirs(C) $envs(C) $dirs(A) $envs(A) $init_test

	# Process messages again in case we are running with debug_rop.
	process_msgs $envlist
	rep_verify $dirs($master) $envs($master) \
	    $dirs($test_client) $envs($test_client) $init_test

	# Add records to the master and update client.
	puts "\tRep$tnum.k: Add more records and check again."
	set entries 10
	eval rep_test $method $envs($master) NULL $entries $start \
	    $start 0 0 $largs
	# NOTE(review): the "err" result variable is not examined after
	# this call -- confirm that ignoring it here is intentional.
	process_msgs $envlist 0 NONE err

	# Check again that everyone is identical.
	rep_verify $dirs(C) $envs(C) $dirs(A) $envs(A) $init_test
	process_msgs $envlist
	rep_verify $dirs($master) $envs($master) \
	    $dirs($test_client) $envs($test_client) $init_test

	if {$cdb != "NONE"} {
		if {$abandoned} {
			# The $cdb was opened in an env which was then
			# abandoned, recovered, marked panic'ed.  We don't
			# really care; we're just trying to clean up resources.
			#
			catch {$cdb close}
		} else {
			error_check_good clientdb_close [$cdb close] 0
		}
	}
	error_check_good masterenv_close [$envs($master) close] 0
	error_check_good clientenv_close [$envs($test_client) close] 0
	error_check_good clientenv2_close [$envs($other) close] 0
	if { $abandoned } {
		# Best-effort close of the handle abandoned by the simulated
		# crash; any error from it is deliberately ignored.
		catch {$abandoned_env close}
	}
	replclose $testdir/MSGQUEUEDIR
}
433