1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2007,2008 Oracle.  All rights reserved.
5 *
6 * $Id: rep_lease.c,v 12.23 2008/01/11 21:49:26 sue Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/log.h"
13
14static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
15
16/*
17 * __rep_update_grant -
18 *      Update a client's lease grant for this perm record
19 *	and send the grant to the master.  Caller must
20 *	hold the mtx_clientdb mutex.  Timespec given is in
21 *	host local format.
22 *
23 * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
24 */
25int
26__rep_update_grant(env, ts)
27	ENV *env;
28	db_timespec *ts;
29{
30	DBT lease_dbt;
31	DB_LOG *dblp;
32	DB_REP *db_rep;
33	LOG *lp;
34	REP *rep;
35	__rep_grant_info_args gi;
36	db_timespec mytime;
37	u_int8_t buf[__REP_GRANT_INFO_SIZE];
38	int ret;
39	size_t len;
40
41	db_rep = env->rep_handle;
42	rep = db_rep->region;
43	dblp = env->lg_handle;
44	lp = dblp->reginfo.primary;
45	timespecclear(&mytime);
46
47	/*
48	 * Get current time, and add in the (skewed) lease duration
49	 * time to send the grant to the master.
50	 */
51	__os_gettime(env, &mytime, 1);
52	timespecadd(&mytime, &rep->lease_duration);
53	REP_SYSTEM_LOCK(env);
54	/*
55	 * If we are in an election, we cannot grant the lease.
56	 * We need to check under the region mutex.
57	 */
58	if (IN_ELECTION(rep)) {
59		REP_SYSTEM_UNLOCK(env);
60		return (0);
61	}
62	if (timespeccmp(&mytime, &rep->grant_expire, >))
63		rep->grant_expire = mytime;
64	REP_SYSTEM_UNLOCK(env);
65
66	/*
67	 * Send the LEASE_GRANT message with the current lease grant
68	 * no matter if we've actually extended the lease or not.
69	 */
70	gi.msg_sec = (u_int32_t)ts->tv_sec;
71	gi.msg_nsec = (u_int32_t)ts->tv_nsec;
72
73	if ((ret = __rep_grant_info_marshal(env, &gi, buf,
74	    __REP_GRANT_INFO_SIZE, &len)) != 0)
75		return (ret);
76	DB_INIT_DBT(lease_dbt, buf, len);
77	(void)__rep_send_message(env, rep->master_id, REP_LEASE_GRANT,
78	    &lp->max_perm_lsn, &lease_dbt, 0, 0);
79	return (0);
80}
81
82/*
83 * __rep_islease_granted -
84 *      Return 0 if this client has no outstanding lease granted.
85 *	Return 1 otherwise.
86 *	Caller must hold the REP_SYSTEM (region) mutex.
87 *
88 * PUBLIC: int __rep_islease_granted __P((ENV *));
89 */
90int
91__rep_islease_granted(env)
92	ENV *env;
93{
94	DB_REP *db_rep;
95	REP *rep;
96	db_timespec mytime;
97
98	db_rep = env->rep_handle;
99	rep = db_rep->region;
100	/*
101	 * Get current time and compare against our granted lease.
102	 */
103	timespecclear(&mytime);
104	__os_gettime(env, &mytime, 1);
105
106	return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0);
107}
108
109/*
110 * __rep_lease_table_alloc -
111 *	Allocate the lease table on a master.  Called with rep mutex
112 * held.  We need to acquire the env region mutex, so we need to
113 * make sure we never acquire those mutexes in the opposite order.
114 *
115 * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
116 */
117int
118__rep_lease_table_alloc(env, nsites)
119	ENV *env;
120	u_int32_t nsites;
121{
122	REGENV *renv;
123	REGINFO *infop;
124	REP *rep;
125	REP_LEASE_ENTRY *le, *table;
126	int *lease, ret;
127	u_int32_t i;
128
129	rep = env->rep_handle->region;
130
131	infop = env->reginfo;
132	renv = infop->primary;
133	MUTEX_LOCK(env, renv->mtx_regenv);
134	if ((ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY),
135	    &lease)) == 0) {
136		if (rep->lease_off != INVALID_ROFF)
137			__env_alloc_free(infop,
138			    R_ADDR(infop, rep->lease_off));
139		rep->lease_off = R_OFFSET(infop, lease);
140	}
141	MUTEX_UNLOCK(env, renv->mtx_regenv);
142	table = R_ADDR(infop, rep->lease_off);
143	for (i = 0; i < nsites; i++) {
144		le = &table[i];
145		le->eid = DB_EID_INVALID;
146		timespecclear(&le->start_time);
147		timespecclear(&le->end_time);
148		ZERO_LSN(le->lease_lsn);
149	}
150	return (ret);
151}
152
153/*
154 * __rep_lease_grant -
155 *	Handle incoming REP_LEASE_GRANT message on a master.
156 *
157 * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
158 */
159int
160__rep_lease_grant(env, rp, rec, eid)
161	ENV *env;
162	__rep_control_args *rp;
163	DBT *rec;
164	int eid;
165{
166	DB_REP *db_rep;
167	REP *rep;
168	__rep_grant_info_args gi;
169	REP_LEASE_ENTRY *le;
170	db_timespec msg_time;
171	int ret;
172
173	db_rep = env->rep_handle;
174	rep = db_rep->region;
175	if ((ret = __rep_grant_info_unmarshal(env,
176	    &gi, rec->data, rec->size, NULL)) != 0)
177		return (ret);
178	timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
179	le = NULL;
180
181	/*
182	 * Get current time, and add in the (skewed) lease duration
183	 * time to send the grant to the master.
184	 */
185	REP_SYSTEM_LOCK(env);
186	__rep_find_entry(env, rep, eid, &le);
187	/*
188	 * We either get back this site's entry, or an empty entry
189	 * that we need to initialize.
190	 */
191	DB_ASSERT(env, le != NULL);
192	/*
193	 * Update the entry if it is an empty entry or if the new
194	 * lease grant is a later start time than the current one.
195	 */
196	RPRINT(env, DB_VERB_REP_LEASE,
197	    (env, "lease_grant: grant msg time %lu %lu",
198	    (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
199	if (le->eid == DB_EID_INVALID ||
200	    timespeccmp(&msg_time, &le->start_time, >)) {
201		le->eid = eid;
202		le->start_time = msg_time;
203		le->end_time = le->start_time;
204		timespecadd(&le->end_time, &rep->lease_duration);
205		RPRINT(env, DB_VERB_REP_LEASE, (env,
206    "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
207    le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
208    (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
209    (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
210		/*
211		 * XXX Is this really true?  Could we have a lagging
212		 * record that has a later start time, but smaller
213		 * LSN than we have previously seen??
214		 */
215		DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0);
216		le->lease_lsn = rp->lsn;
217	}
218	REP_SYSTEM_UNLOCK(env);
219	return (0);
220}
221
222/*
223 * Find the entry for the given EID.  Or the first empty one.
224 */
225static void
226__rep_find_entry(env, rep, eid, lep)
227	ENV *env;
228	REP *rep;
229	int eid;
230	REP_LEASE_ENTRY **lep;
231{
232	REGINFO *infop;
233	REP_LEASE_ENTRY *le, *table;
234	u_int32_t i;
235
236	infop = env->reginfo;
237	table = R_ADDR(infop, rep->lease_off);
238
239	for (i = 0; i < rep->nsites; i++) {
240		le = &table[i];
241		/*
242		 * Find either the one that matches the client's
243		 * EID or the first empty one.
244		 */
245		if (le->eid == eid || le->eid == DB_EID_INVALID) {
246			*lep = le;
247			return;
248		}
249	}
250	return;
251}
252
253/*
254 * __rep_lease_check -
255 *      Return 0 if this master holds valid leases and can confirm
256 *	its mastership.  If leases are expired, an attempt is made
257 *	to refresh the leases.  If that fails, then return the
258 *	DB_REP_LEASE_EXPIRED error to the user.  No mutexes held.
259 *
260 * PUBLIC: int __rep_lease_check __P((ENV *, int));
261 */
262int
263__rep_lease_check(env, refresh)
264	ENV *env;
265	int refresh;
266{
267	DB_LOG *dblp;
268	DB_LSN lease_lsn;
269	DB_REP *db_rep;
270	LOG *lp;
271	REGINFO *infop;
272	REP *rep;
273	REP_LEASE_ENTRY *le, *table;
274	db_timespec curtime;
275	int ret, tries;
276	u_int32_t i, min_leases, valid_leases;
277
278	infop = env->reginfo;
279	tries = 0;
280retry:
281	ret = 0;
282	db_rep = env->rep_handle;
283	rep = db_rep->region;
284	dblp = env->lg_handle;
285	lp = dblp->reginfo.primary;
286	LOG_SYSTEM_LOCK(env);
287	lease_lsn = lp->max_perm_lsn;
288	LOG_SYSTEM_UNLOCK(env);
289	REP_SYSTEM_LOCK(env);
290	min_leases = rep->nsites / 2;
291
292	__os_gettime(env, &curtime, 1);
293	RPRINT(env, DB_VERB_REP_LEASE,
294	    (env, "lease_check: min_leases %lu curtime %lu %lu",
295	    (u_long)min_leases, (u_long)curtime.tv_sec,
296	    (u_long)curtime.tv_nsec));
297	table = R_ADDR(infop, rep->lease_off);
298	for (i = 0, valid_leases = 0;
299	    i < rep->nsites && valid_leases < min_leases; i++) {
300		le = &table[i];
301		/*
302		 * Count this lease as valid if:
303		 * - It is a valid entry (has an EID).
304		 * - The lease has not expired.
305		 * - The LSN is up to date.
306		 */
307		if (le->eid != DB_EID_INVALID) {
308			RPRINT(env, DB_VERB_REP_LEASE, (env,
309		    "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
310			    (u_long)valid_leases, le->eid,
311			    (u_long)le->lease_lsn.file,
312			    (u_long)le->lease_lsn.offset));
313			RPRINT(env, DB_VERB_REP_LEASE,
314			    (env, "lease_check: endtime %lu %lu",
315			    (u_long)le->end_time.tv_sec,
316			    (u_long)le->end_time.tv_nsec));
317		}
318		if (le->eid != DB_EID_INVALID &&
319		    timespeccmp(&le->end_time, &curtime, >=) &&
320		    LOG_COMPARE(&le->lease_lsn, &lease_lsn) == 0)
321			valid_leases++;
322	}
323	REP_SYSTEM_UNLOCK(env);
324
325	/*
326	 * Now see if we have enough.
327	 */
328	RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu",
329	    (u_long)valid_leases, (u_long)min_leases));
330	if (valid_leases < min_leases) {
331		if (!refresh)
332			ret = DB_REP_LEASE_EXPIRED;
333		else {
334			/*
335			 * If we are successful, we need to recheck the leases
336			 * because the lease grant messages may have raced with
337			 * the PERM acknowledgement.  Give the grant messages
338			 * a chance to arrive and be processed.
339			 */
340			if ((ret = __rep_lease_refresh(env)) == 0) {
341				if (tries <= LEASE_REFRESH_TRIES) {
342					/*
343					 * If we were successful sending, but
344					 * not in racing the message threads,
345					 * then yield the processor so that
346					 * the message threads get a chance
347					 * to run.
348					 */
349					if (tries > 0)
350						__os_yield(env, 1, 0);
351					tries++;
352					goto retry;
353				} else
354					ret = DB_REP_LEASE_EXPIRED;
355			}
356		}
357	}
358
359	return (ret);
360}
361
362/*
363 * __rep_lease_refresh -
364 *	Find the last permanent record and send that out so that it
365 *	forces clients to grant their leases.
366 *
367 * PUBLIC: int __rep_lease_refresh __P((ENV *));
368 */
369int
370__rep_lease_refresh(env)
371	ENV *env;
372{
373	DBT rec;
374	DB_LOGC *logc;
375	DB_LSN lsn;
376	DB_REP *db_rep;
377	REP *rep;
378	int ret, t_ret;
379
380	db_rep = env->rep_handle;
381	rep = db_rep->region;
382
383	if ((ret = __log_cursor(env, &logc)) != 0)
384		return (ret);
385
386	memset(&rec, 0, sizeof(rec));
387	memset(&lsn, 0, sizeof(lsn));
388	/*
389	 * Use __rep_log_backup to find the last PERM record.
390	 */
391	if ((ret = __rep_log_backup(env, rep, logc, &lsn)) != 0)
392		goto err;
393
394	if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0)
395		goto err;
396
397	if ((ret = __rep_send_message(env,
398	    DB_EID_BROADCAST, REP_LOG, &lsn, &rec, REPCTL_PERM, 0)) != 0) {
399		/*
400		 * If we do not get an ack, we expire leases.
401		 */
402		(void)__rep_lease_expire(env, 0);
403		ret = DB_REP_LEASE_EXPIRED;
404	}
405
406err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
407		ret = t_ret;
408	return (ret);
409}
410
411/*
412 * __rep_lease_expire -
413 *	Proactively expire all leases granted to us.
414 *
415 * PUBLIC: int __rep_lease_expire __P((ENV *, int));
416 */
417int
418__rep_lease_expire(env, locked)
419	ENV *env;
420	int locked;
421{
422	DB_REP *db_rep;
423	REGINFO *infop;
424	REP *rep;
425	REP_LEASE_ENTRY *le, *table;
426	int ret;
427	u_int32_t i;
428
429	ret = 0;
430	db_rep = env->rep_handle;
431	rep = db_rep->region;
432	infop = env->reginfo;
433
434	if (!locked)
435		REP_SYSTEM_LOCK(env);
436	if (rep->lease_off != INVALID_ROFF) {
437		table = R_ADDR(infop, rep->lease_off);
438		/*
439		 * Expire all leases forcibly.  We are guaranteed that the
440		 * start_time for all leases are not in the future.  Therefore,
441		 * set the end_time to the start_time.
442		 */
443		for (i = 0; i < rep->nsites; i++) {
444			le = &table[i];
445			le->end_time = le->start_time;
446		}
447	}
448	if (!locked)
449		REP_SYSTEM_UNLOCK(env);
450	return (ret);
451}
452
453/*
454 * __rep_lease_waittime -
455 *	Return the amount of time remaining on a granted lease.
456 * Assume the caller holds the REP_SYSTEM (region) mutex.
457 *
458 * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
459 */
460db_timeout_t
461__rep_lease_waittime(env)
462	ENV *env;
463{
464	DB_REP *db_rep;
465	REP *rep;
466	db_timespec exptime, mytime;
467	db_timeout_t to;
468
469	db_rep = env->rep_handle;
470	rep = db_rep->region;
471	exptime = rep->grant_expire;
472	to = 0;
473	/*
474	 * If the lease has never been granted, we must wait a full
475	 * lease timeout because we could be freshly rebooted after
476	 * a crash and a lease could be granted from a previous
477	 * incarnation of this client.
478	 */
479	RPRINT(env, DB_VERB_REP_LEASE, (env,
480    "wait_time: grant_expire %lu %lu lease_to %lu",
481	    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
482	    (u_long)rep->lease_timeout));
483	if (!timespecisset(&exptime))
484		to = rep->lease_timeout;
485	else {
486		__os_gettime(env, &mytime, 1);
487		RPRINT(env, DB_VERB_REP_LEASE, (env,
488    "wait_time: mytime %lu %lu, grant_expire %lu %lu",
489		    (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
490		    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
491		if (timespeccmp(&mytime, &exptime, <=)) {
492			/*
493			 * If the current time is before the grant expiration
494			 * compute the difference and return remaining grant
495			 * time.
496			 */
497			timespecsub(&exptime, &mytime);
498			DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
499		}
500	}
501	return (to);
502}
503