1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
5 *
6 * $Id: rep_region.c,v 12.55 2008/01/11 20:50:03 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_am.h"
14#include "dbinc/log.h"
15
16static int __rep_egen_init  __P((ENV *, REP *));
17static int __rep_gen_init  __P((ENV *, REP *));
18
19/*
20 * __rep_open --
21 *	Initialize the shared memory state for the replication system.
22 *
23 * PUBLIC: int __rep_open __P((ENV *));
24 */
25int
26__rep_open(env)
27	ENV *env;
28{
29	DB_REP *db_rep;
30	REGENV *renv;
31	REGINFO *infop;
32	REP *rep;
33	int ret;
34
35	db_rep = env->rep_handle;
36	infop = env->reginfo;
37	renv = infop->primary;
38	ret = 0;
39
40	if (renv->rep_off == INVALID_ROFF) {
41		/* Must create the region. */
42		if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0)
43			return (ret);
44		memset(rep, 0, sizeof(*rep));
45
46		/*
47		 * We have the region; fill in the values.  Some values may
48		 * have been configured before we open the region, and those
49		 * are taken from the DB_REP structure.
50		 */
51		if ((ret = __mutex_alloc(
52		    env, MTX_REP_REGION, 0, &rep->mtx_region)) != 0)
53			return (ret);
54		/*
55		 * Because we have no way to prevent deadlocks and cannot log
56		 * changes made to it, we single-thread access to the client
57		 * bookkeeping database.  This is suboptimal, but it only gets
58		 * accessed when messages arrive out-of-order, so it should
59		 * stay small and not be used in a high-performance app.
60		 */
61		if ((ret = __mutex_alloc(
62		    env, MTX_REP_DATABASE, 0, &rep->mtx_clientdb)) != 0)
63			return (ret);
64
65		if ((ret = __mutex_alloc(
66		    env, MTX_REP_CHKPT, 0, &rep->mtx_ckp)) != 0)
67			return (ret);
68
69		if ((ret = __mutex_alloc(
70		    env, MTX_REP_EVENT, 0, &rep->mtx_event)) != 0)
71			return (ret);
72
73		rep->newmaster_event_gen = 0;
74		rep->notified_egen = 0;
75		rep->lease_off = INVALID_ROFF;
76		rep->tally_off = INVALID_ROFF;
77		rep->v2tally_off = INVALID_ROFF;
78		rep->eid = db_rep->eid;
79		rep->master_id = DB_EID_INVALID;
80		rep->gen = 0;
81		rep->version = DB_REPVERSION;
82		if ((ret = __rep_gen_init(env, rep)) != 0)
83			return (ret);
84		if ((ret = __rep_egen_init(env, rep)) != 0)
85			return (ret);
86		rep->gbytes = db_rep->gbytes;
87		rep->bytes = db_rep->bytes;
88		rep->request_gap = db_rep->request_gap;
89		rep->max_gap = db_rep->max_gap;
90		rep->config_nsites = db_rep->config_nsites;
91		rep->config = db_rep->config;
92		rep->elect_timeout = db_rep->elect_timeout;
93		rep->full_elect_timeout = db_rep->full_elect_timeout;
94		rep->lease_timeout = db_rep->lease_timeout;
95		rep->clock_skew = db_rep->clock_skew;
96		rep->clock_base = db_rep->clock_base;
97		timespecclear(&rep->lease_duration);
98		timespecclear(&rep->grant_expire);
99		rep->chkpt_delay = db_rep->chkpt_delay;
100		rep->priority = db_rep->my_priority;
101
102		F_SET(rep, REP_F_NOARCHIVE);
103
104		/* Initialize encapsulating region. */
105		renv->rep_off = R_OFFSET(infop, rep);
106		(void)time(&renv->rep_timestamp);
107		renv->op_timestamp = 0;
108		F_CLR(renv, DB_REGENV_REPLOCKED);
109	} else
110		rep = R_ADDR(infop, renv->rep_off);
111
112	db_rep->region = rep;
113
114	return (0);
115}
116
117/*
118 * __rep_env_refresh --
119 *	Replication-specific refresh of the ENV structure.
120 *
121 * PUBLIC: int __rep_env_refresh __P((ENV *));
122 */
123int
124__rep_env_refresh(env)
125	ENV *env;
126{
127	DB_REP *db_rep;
128	REGENV *renv;
129	REGINFO *infop;
130	REP *rep;
131	int ret, t_ret;
132
133	db_rep = env->rep_handle;
134	rep = db_rep->region;
135	infop = env->reginfo;
136	renv = infop->primary;
137	ret = 0;
138
139	/*
140	 * If we are the last reference closing the env, clear our knowledge of
141	 * belonging to a group and that there is a valid handle where
142	 * rep_start had already been called.
143	 */
144	if (renv->refcnt == 1) {
145		F_CLR(rep, REP_F_GROUP_ESTD);
146		F_CLR(rep, REP_F_START_CALLED);
147	}
148
149	/*
150	 * If a private region, return the memory to the heap.  Not needed for
151	 * filesystem-backed or system shared memory regions, that memory isn't
152	 * owned by any particular process.
153	 */
154	if (F_ISSET(env, ENV_PRIVATE)) {
155		db_rep = env->rep_handle;
156		if (db_rep->region != NULL) {
157			ret = __mutex_free(env, &db_rep->region->mtx_region);
158			if ((t_ret = __mutex_free(env,
159			    &db_rep->region->mtx_clientdb)) != 0 && ret == 0)
160				ret = t_ret;
161			if ((t_ret = __mutex_free(env,
162			    &db_rep->region->mtx_ckp)) != 0 && ret == 0)
163				ret = t_ret;
164			if ((t_ret = __mutex_free(env,
165			    &db_rep->region->mtx_event)) != 0 && ret == 0)
166				ret = t_ret;
167		}
168
169		if (renv->rep_off != INVALID_ROFF)
170			__env_alloc_free(infop, R_ADDR(infop, renv->rep_off));
171	}
172
173	env->rep_handle->region = NULL;
174	return (ret);
175}
176
177/*
178 * __rep_close --
179 *      Shut down all of replication.
180 *
181 * PUBLIC: int __rep_env_close __P((ENV *));
182 */
183int
184__rep_env_close(env)
185	ENV *env;
186{
187	int ret, t_ret;
188
189	ret = __rep_preclose(env);
190	if ((t_ret = __rep_closefiles(env, 0)) != 0 && ret == 0)
191		ret = t_ret;
192	return (ret);
193}
194
195/*
196 * __rep_preclose --
197 *	If we are a client, shut down our client database and send
198 * any outstanding bulk buffers.
199 *
200 * PUBLIC: int __rep_preclose __P((ENV *));
201 */
202int
203__rep_preclose(env)
204	ENV *env;
205{
206	DB_LOG *dblp;
207	DB_REP *db_rep;
208	LOG *lp;
209	REP_BULK bulk;
210	int ret;
211
212	ret = 0;
213
214	db_rep = env->rep_handle;
215	dblp = env->lg_handle;
216
217	/*
218	 * If we have a rep region, we can preclose.  Otherwise, return.
219	 * If we're on an error path from env open, we may not have
220	 * a region, even though we have a handle.
221	 */
222	if (db_rep == NULL || db_rep->region == NULL)
223		return (ret);
224	MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
225	if (db_rep->rep_db != NULL) {
226		ret = __db_close(db_rep->rep_db, NULL, DB_NOSYNC);
227		db_rep->rep_db = NULL;
228	}
229	/*
230	 * We could be called early in an env_open error path, so
231	 * only do this if we have a log region set up.
232	 */
233	if (dblp == NULL)
234		goto out;
235	lp = dblp->reginfo.primary;
236	/*
237	 * If we have something in the bulk buffer, send anything in it
238	 * if we are able to.
239	 */
240	if (lp->bulk_off != 0 && db_rep->send != NULL) {
241		memset(&bulk, 0, sizeof(bulk));
242		bulk.addr = R_ADDR(&dblp->reginfo, lp->bulk_buf);
243		bulk.offp = &lp->bulk_off;
244		bulk.len = lp->bulk_len;
245		bulk.type = REP_BULK_LOG;
246		bulk.eid = DB_EID_BROADCAST;
247		bulk.flagsp = &lp->bulk_flags;
248		/*
249		 * Ignore send errors here.  This can be called on the
250		 * env->close path - make a best attempt to send.
251		 */
252		(void)__rep_send_bulk(env, &bulk, 0);
253	}
254out:	MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
255	return (ret);
256}
257
258/*
259 * __rep_closefiles --
260 *	If we were a client and are now a master, close all databases
261 *	we've opened while applying messages as a client.  This can
262 *	be called from __env_close and we need to check if the env,
263 *	handles and regions are set up, or not.
264 *
265 * PUBLIC: int __rep_closefiles __P((ENV *, int));
266 */
267int
268__rep_closefiles(env, do_restored)
269	ENV *env;
270	int do_restored;
271{
272	DB_LOG *dblp;
273	DB_REP *db_rep;
274	int ret;
275
276	ret = 0;
277
278	db_rep = env->rep_handle;
279	dblp = env->lg_handle;
280
281	if (db_rep == NULL || db_rep->region == NULL)
282		return (ret);
283	if (dblp == NULL)
284		return (ret);
285	if ((ret = __dbreg_close_files(env, do_restored)) == 0)
286		F_CLR(db_rep, DBREP_OPENFILES);
287
288	return (ret);
289}
290
291/*
292 * __rep_egen_init --
293 *	Initialize the value of egen in the region.  Called only from
294 *	__rep_region_init, which is guaranteed to be single-threaded
295 *	as we create the rep region.  We set the rep->egen field which
296 *	is normally protected by db_rep->region->mutex.
297 */
298static int
299__rep_egen_init(env, rep)
300	ENV *env;
301	REP *rep;
302{
303	DB_FH *fhp;
304	int ret;
305	size_t cnt;
306	char *p;
307
308	if ((ret =
309	    __db_appname(env, DB_APP_NONE, REP_EGENNAME, 0, NULL, &p)) != 0)
310		return (ret);
311	/*
312	 * If the file doesn't exist, create it now and initialize with 1.
313	 */
314	if (__os_exists(env, p, NULL) != 0) {
315		rep->egen = rep->gen + 1;
316		if ((ret = __rep_write_egen(env, rep->egen)) != 0)
317			goto err;
318	} else {
319		/*
320		 * File exists, open it and read in our egen.
321		 */
322		if ((ret = __os_open(env, p, 0,
323		    DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
324			goto err;
325		if ((ret = __os_read(env, fhp, &rep->egen, sizeof(u_int32_t),
326		    &cnt)) != 0 || cnt != sizeof(u_int32_t))
327			goto err1;
328		RPRINT(env, DB_VERB_REP_MISC,
329		    (env, "Read in egen %lu", (u_long)rep->egen));
330err1:		 (void)__os_closehandle(env, fhp);
331	}
332err:	__os_free(env, p);
333	return (ret);
334}
335
336/*
337 * __rep_write_egen --
338 *	Write out the egen into the env file.
339 *
340 * PUBLIC: int __rep_write_egen __P((ENV *, u_int32_t));
341 */
342int
343__rep_write_egen(env, egen)
344	ENV *env;
345	u_int32_t egen;
346{
347	DB_FH *fhp;
348	int ret;
349	size_t cnt;
350	char *p;
351
352	if ((ret =
353	    __db_appname(env, DB_APP_NONE, REP_EGENNAME, 0, NULL, &p)) != 0)
354		return (ret);
355	if ((ret = __os_open(
356	    env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
357		if ((ret = __os_write(env, fhp, &egen, sizeof(u_int32_t),
358		    &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
359			__db_err(env, ret, "%s", p);
360		(void)__os_closehandle(env, fhp);
361	}
362	__os_free(env, p);
363	return (ret);
364}
365
366/*
367 * __rep_gen_init --
368 *	Initialize the value of gen in the region.  Called only from
369 *	__rep_region_init, which is guaranteed to be single-threaded
370 *	as we create the rep region.  We set the rep->gen field which
371 *	is normally protected by db_rep->region->mutex.
372 */
373static int
374__rep_gen_init(env, rep)
375	ENV *env;
376	REP *rep;
377{
378	DB_FH *fhp;
379	int ret;
380	size_t cnt;
381	char *p;
382
383	if ((ret =
384	    __db_appname(env, DB_APP_NONE, REP_GENNAME, 0, NULL, &p)) != 0)
385		return (ret);
386	/*
387	 * If the file doesn't exist, create it now and initialize with 0.
388	 */
389	if (__os_exists(env, p, NULL) != 0) {
390		rep->gen = 0;
391		if ((ret = __rep_write_gen(env, rep->gen)) != 0)
392			goto err;
393	} else {
394		/*
395		 * File exists, open it and read in our gen.
396		 */
397		if ((ret = __os_open(env, p, 0,
398		    DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
399			goto err;
400		if ((ret = __os_read(env, fhp, &rep->gen, sizeof(u_int32_t),
401		    &cnt)) < 0 || cnt == 0)
402			goto err1;
403		RPRINT(env, DB_VERB_REP_MISC, (env, "Read in gen %lu",
404		    (u_long)rep->gen));
405err1:		 (void)__os_closehandle(env, fhp);
406	}
407err:	__os_free(env, p);
408	return (ret);
409}
410
411/*
412 * __rep_write_gen --
413 *	Write out the gen into the env file.
414 *
415 * PUBLIC: int __rep_write_gen __P((ENV *, u_int32_t));
416 */
417int
418__rep_write_gen(env, gen)
419	ENV *env;
420	u_int32_t gen;
421{
422	DB_FH *fhp;
423	int ret;
424	size_t cnt;
425	char *p;
426
427	if ((ret =
428	    __db_appname(env, DB_APP_NONE, REP_GENNAME, 0, NULL, &p)) != 0)
429		return (ret);
430	if ((ret = __os_open(
431	    env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) {
432		if ((ret = __os_write(env, fhp, &gen, sizeof(u_int32_t),
433		    &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0))
434			__db_err(env, ret, "%s", p);
435		(void)__os_closehandle(env, fhp);
436	}
437	__os_free(env, p);
438	return (ret);
439}
440