1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2007-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/log.h"
13
14static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
15
16/*
17 * __rep_update_grant -
18 *      Update a client's lease grant for this perm record
19 *	and send the grant to the master.  Caller must
20 *	hold the mtx_clientdb mutex.  Timespec given is in
21 *	host local format.
22 *
23 * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
24 */
25int
26__rep_update_grant(env, ts)
27	ENV *env;
28	db_timespec *ts;
29{
30	DBT lease_dbt;
31	DB_LOG *dblp;
32	DB_REP *db_rep;
33	LOG *lp;
34	REP *rep;
35	__rep_grant_info_args gi;
36	db_timespec mytime;
37	u_int8_t buf[__REP_GRANT_INFO_SIZE];
38	int master, ret;
39	size_t len;
40
41	db_rep = env->rep_handle;
42	rep = db_rep->region;
43	dblp = env->lg_handle;
44	lp = dblp->reginfo.primary;
45	timespecclear(&mytime);
46
47	/*
48	 * Get current time, and add in the (skewed) lease duration
49	 * time to send the grant to the master.
50	 */
51	__os_gettime(env, &mytime, 1);
52	timespecadd(&mytime, &rep->lease_duration);
53	REP_SYSTEM_LOCK(env);
54	/*
55	 * If we are in an election, we cannot grant the lease.
56	 * We need to check under the region mutex.
57	 */
58	if (IN_ELECTION(rep)) {
59		REP_SYSTEM_UNLOCK(env);
60		return (0);
61	}
62	if (timespeccmp(&mytime, &rep->grant_expire, >))
63		rep->grant_expire = mytime;
64	F_CLR(rep, REP_F_LEASE_EXPIRED);
65	REP_SYSTEM_UNLOCK(env);
66
67	/*
68	 * Send the LEASE_GRANT message with the current lease grant
69	 * no matter if we've actually extended the lease or not.
70	 */
71	gi.msg_sec = (u_int32_t)ts->tv_sec;
72	gi.msg_nsec = (u_int32_t)ts->tv_nsec;
73
74	if ((ret = __rep_grant_info_marshal(env, &gi, buf,
75	    __REP_GRANT_INFO_SIZE, &len)) != 0)
76		return (ret);
77	DB_INIT_DBT(lease_dbt, buf, len);
78	if ((master = rep->master_id) != DB_EID_INVALID)
79		(void)__rep_send_message(env, master, REP_LEASE_GRANT,
80		    &lp->max_perm_lsn, &lease_dbt, 0, 0);
81	return (0);
82}
83
84/*
85 * __rep_islease_granted -
86 *      Return 0 if this client has no outstanding lease granted.
87 *	Return 1 otherwise.
88 *	Caller must hold the REP_SYSTEM (region) mutex.
89 *
90 * PUBLIC: int __rep_islease_granted __P((ENV *));
91 */
92int
93__rep_islease_granted(env)
94	ENV *env;
95{
96	DB_REP *db_rep;
97	REP *rep;
98	db_timespec mytime;
99
100	db_rep = env->rep_handle;
101	rep = db_rep->region;
102	/*
103	 * Get current time and compare against our granted lease.
104	 */
105	timespecclear(&mytime);
106	__os_gettime(env, &mytime, 1);
107
108	return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0);
109}
110
111/*
112 * __rep_lease_table_alloc -
113 *	Allocate the lease table on a master.  Called with rep mutex
114 * held.  We need to acquire the env region mutex, so we need to
115 * make sure we never acquire those mutexes in the opposite order.
116 *
117 * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
118 */
119int
120__rep_lease_table_alloc(env, nsites)
121	ENV *env;
122	u_int32_t nsites;
123{
124	REGENV *renv;
125	REGINFO *infop;
126	REP *rep;
127	REP_LEASE_ENTRY *le, *table;
128	int *lease, ret;
129	u_int32_t i;
130
131	rep = env->rep_handle->region;
132
133	infop = env->reginfo;
134	renv = infop->primary;
135	MUTEX_LOCK(env, renv->mtx_regenv);
136	/*
137	 * If we have an old table from some other time, free it and
138	 * allocate ourselves a new one that is known to be for
139	 * the right number of sites.
140	 */
141	if (rep->lease_off != INVALID_ROFF) {
142		__env_alloc_free(infop,
143		    R_ADDR(infop, rep->lease_off));
144		rep->lease_off = INVALID_ROFF;
145	}
146	ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY),
147	    &lease);
148	MUTEX_UNLOCK(env, renv->mtx_regenv);
149	if (ret != 0)
150		return (ret);
151	else
152		rep->lease_off = R_OFFSET(infop, lease);
153	table = R_ADDR(infop, rep->lease_off);
154	for (i = 0; i < nsites; i++) {
155		le = &table[i];
156		le->eid = DB_EID_INVALID;
157		timespecclear(&le->start_time);
158		timespecclear(&le->end_time);
159		ZERO_LSN(le->lease_lsn);
160	}
161	return (0);
162}
163
164/*
165 * __rep_lease_grant -
166 *	Handle incoming REP_LEASE_GRANT message on a master.
167 *
168 * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
169 */
170int
171__rep_lease_grant(env, rp, rec, eid)
172	ENV *env;
173	__rep_control_args *rp;
174	DBT *rec;
175	int eid;
176{
177	DB_REP *db_rep;
178	REP *rep;
179	__rep_grant_info_args gi;
180	REP_LEASE_ENTRY *le;
181	db_timespec msg_time;
182	int ret;
183
184	db_rep = env->rep_handle;
185	rep = db_rep->region;
186	if ((ret = __rep_grant_info_unmarshal(env,
187	    &gi, rec->data, rec->size, NULL)) != 0)
188		return (ret);
189	timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
190	le = NULL;
191
192	/*
193	 * Get current time, and add in the (skewed) lease duration
194	 * time to send the grant to the master.
195	 */
196	REP_SYSTEM_LOCK(env);
197	__rep_find_entry(env, rep, eid, &le);
198	/*
199	 * We either get back this site's entry, or an empty entry
200	 * that we need to initialize.
201	 */
202	DB_ASSERT(env, le != NULL);
203	/*
204	 * Update the entry if it is an empty entry or if the new
205	 * lease grant is a later start time than the current one.
206	 */
207	RPRINT(env, DB_VERB_REP_LEASE,
208	    (env, "lease_grant: grant msg time %lu %lu",
209	    (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
210	if (le->eid == DB_EID_INVALID ||
211	    timespeccmp(&msg_time, &le->start_time, >)) {
212		le->eid = eid;
213		le->start_time = msg_time;
214		le->end_time = le->start_time;
215		timespecadd(&le->end_time, &rep->lease_duration);
216		RPRINT(env, DB_VERB_REP_LEASE, (env,
217    "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
218    le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
219    (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
220    (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
221		/*
222		 * XXX Is this really true?  Could we have a lagging
223		 * record that has a later start time, but smaller
224		 * LSN than we have previously seen??
225		 */
226		DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0);
227		le->lease_lsn = rp->lsn;
228	}
229	REP_SYSTEM_UNLOCK(env);
230	return (0);
231}
232
233/*
234 * Find the entry for the given EID.  Or the first empty one.
235 */
236static void
237__rep_find_entry(env, rep, eid, lep)
238	ENV *env;
239	REP *rep;
240	int eid;
241	REP_LEASE_ENTRY **lep;
242{
243	REGINFO *infop;
244	REP_LEASE_ENTRY *le, *table;
245	u_int32_t i;
246
247	infop = env->reginfo;
248	table = R_ADDR(infop, rep->lease_off);
249
250	for (i = 0; i < rep->nsites; i++) {
251		le = &table[i];
252		/*
253		 * Find either the one that matches the client's
254		 * EID or the first empty one.
255		 */
256		if (le->eid == eid || le->eid == DB_EID_INVALID) {
257			*lep = le;
258			return;
259		}
260	}
261	return;
262}
263
264/*
265 * __rep_lease_check -
266 *      Return 0 if this master holds valid leases and can confirm
267 *	its mastership.  If leases are expired, an attempt is made
268 *	to refresh the leases.  If that fails, then return the
269 *	DB_REP_LEASE_EXPIRED error to the user.  No mutexes held.
270 *
271 * PUBLIC: int __rep_lease_check __P((ENV *, int));
272 */
273int
274__rep_lease_check(env, refresh)
275	ENV *env;
276	int refresh;
277{
278	DB_LOG *dblp;
279	DB_LSN lease_lsn;
280	DB_REP *db_rep;
281	LOG *lp;
282	REGINFO *infop;
283	REP *rep;
284	REP_LEASE_ENTRY *le, *table;
285	db_timespec curtime;
286	int ret, tries;
287	u_int32_t i, min_leases, valid_leases;
288
289	infop = env->reginfo;
290	tries = 0;
291	db_rep = env->rep_handle;
292	rep = db_rep->region;
293	dblp = env->lg_handle;
294	lp = dblp->reginfo.primary;
295	LOG_SYSTEM_LOCK(env);
296	lease_lsn = lp->max_perm_lsn;
297	LOG_SYSTEM_UNLOCK(env);
298
299retry:
300	REP_SYSTEM_LOCK(env);
301	min_leases = rep->nsites / 2;
302	ret = 0;
303	__os_gettime(env, &curtime, 1);
304	RPRINT(env, DB_VERB_REP_LEASE, (env,
305	"lease_check: try %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
306	    tries,
307	    (u_long)min_leases, (u_long)curtime.tv_sec,
308	    (u_long)curtime.tv_nsec,
309	    (u_long)lease_lsn.file,
310	    (u_long)lease_lsn.offset));
311	table = R_ADDR(infop, rep->lease_off);
312	for (i = 0, valid_leases = 0;
313	    i < rep->nsites && valid_leases < min_leases; i++) {
314		le = &table[i];
315		/*
316		 * Count this lease as valid if:
317		 * - It is a valid entry (has an EID).
318		 * - The lease has not expired.
319		 * - The LSN is up to date.
320		 */
321		if (le->eid != DB_EID_INVALID) {
322			RPRINT(env, DB_VERB_REP_LEASE, (env,
323		    "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
324			    (u_long)valid_leases, le->eid,
325			    (u_long)le->lease_lsn.file,
326			    (u_long)le->lease_lsn.offset));
327			RPRINT(env, DB_VERB_REP_LEASE,
328			    (env, "lease_check: endtime %lu %lu",
329			    (u_long)le->end_time.tv_sec,
330			    (u_long)le->end_time.tv_nsec));
331		}
332		if (le->eid != DB_EID_INVALID &&
333		    timespeccmp(&le->end_time, &curtime, >=) &&
334		    LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0)
335			valid_leases++;
336	}
337	REP_SYSTEM_UNLOCK(env);
338
339	/*
340	 * Now see if we have enough.
341	 */
342	RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu",
343	    (u_long)valid_leases, (u_long)min_leases));
344	if (valid_leases < min_leases) {
345		if (!refresh)
346			ret = DB_REP_LEASE_EXPIRED;
347		else {
348			/*
349			 * If we are successful, we need to recheck the leases
350			 * because the lease grant messages may have raced with
351			 * the PERM acknowledgement.  Give the grant messages
352			 * a chance to arrive and be processed.
353			 */
354			if ((ret = __rep_lease_refresh(env)) == 0) {
355				if (tries <= LEASE_REFRESH_TRIES) {
356					/*
357					 * If we were successful sending, but
358					 * not in racing the message threads,
359					 * then yield the processor so that
360					 * the message threads get a chance
361					 * to run.
362					 */
363					if (tries > 0)
364						__os_yield(env, 1, 0);
365					tries++;
366					goto retry;
367				} else
368					ret = DB_REP_LEASE_EXPIRED;
369			}
370		}
371	}
372
373	if (ret == DB_REP_LEASE_EXPIRED)
374		RPRINT(env, DB_VERB_REP_LEASE, (env,
375		    "lease_check: Expired.  Only %lu valid",
376		    (u_long)valid_leases));
377	return (ret);
378}
379
380/*
381 * __rep_lease_refresh -
382 *	Find the last permanent record and send that out so that it
383 *	forces clients to grant their leases.
384 *
385 *	If there is no permanent record, this function cannot refresh
386 *	leases.  That should not happen because the master should write
387 *	a checkpoint when it starts, if there is no other perm record.
388 *
389 * PUBLIC: int __rep_lease_refresh __P((ENV *));
390 */
391int
392__rep_lease_refresh(env)
393	ENV *env;
394{
395	DBT rec;
396	DB_LOGC *logc;
397	DB_LSN lsn;
398	DB_REP *db_rep;
399	REP *rep;
400	int ret, t_ret;
401
402	db_rep = env->rep_handle;
403	rep = db_rep->region;
404
405	if ((ret = __log_cursor(env, &logc)) != 0)
406		return (ret);
407
408	memset(&rec, 0, sizeof(rec));
409	memset(&lsn, 0, sizeof(lsn));
410	/*
411	 * Use __rep_log_backup to find the last PERM record.
412	 */
413	if ((ret = __rep_log_backup(env, rep, logc, &lsn)) != 0) {
414		/*
415		 * If there is no PERM record, then we get DB_NOTFOUND.
416		 */
417		if (ret == DB_NOTFOUND)
418			ret = 0;
419		goto err;
420	}
421
422	if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0)
423		goto err;
424
425	(void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn,
426	    &rec, REPCTL_PERM, 0);
427
428err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
429		ret = t_ret;
430	return (ret);
431}
432
433/*
434 * __rep_lease_expire -
435 *	Proactively expire all leases granted to us.
436 * Assume the caller holds the REP_SYSTEM (region) mutex.
437 *
438 * PUBLIC: int __rep_lease_expire __P((ENV *));
439 */
440int
441__rep_lease_expire(env)
442	ENV *env;
443{
444	DB_REP *db_rep;
445	REGINFO *infop;
446	REP *rep;
447	REP_LEASE_ENTRY *le, *table;
448	int ret;
449	u_int32_t i;
450
451	ret = 0;
452	db_rep = env->rep_handle;
453	rep = db_rep->region;
454	infop = env->reginfo;
455
456	if (rep->lease_off != INVALID_ROFF) {
457		table = R_ADDR(infop, rep->lease_off);
458		/*
459		 * Expire all leases forcibly.  We are guaranteed that the
460		 * start_time for all leases are not in the future.  Therefore,
461		 * set the end_time to the start_time.
462		 */
463		for (i = 0; i < rep->nsites; i++) {
464			le = &table[i];
465			le->end_time = le->start_time;
466		}
467	}
468	return (ret);
469}
470
471/*
472 * __rep_lease_waittime -
473 *	Return the amount of time remaining on a granted lease.
474 * Assume the caller holds the REP_SYSTEM (region) mutex.
475 *
476 * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
477 */
478db_timeout_t
479__rep_lease_waittime(env)
480	ENV *env;
481{
482	DB_REP *db_rep;
483	REP *rep;
484	db_timespec exptime, mytime;
485	db_timeout_t to;
486
487	db_rep = env->rep_handle;
488	rep = db_rep->region;
489	exptime = rep->grant_expire;
490	to = 0;
491	/*
492	 * If the lease has never been granted, we must wait a full
493	 * lease timeout because we could be freshly rebooted after
494	 * a crash and a lease could be granted from a previous
495	 * incarnation of this client.  However, if the lease has never
496	 * been granted, and this client has already waited a full
497	 * lease timeout, we know our lease cannot be granted and there
498	 * is no need to wait again.
499	 */
500	RPRINT(env, DB_VERB_REP_LEASE, (env,
501    "wait_time: grant_expire %lu %lu lease_to %lu",
502	    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
503	    (u_long)rep->lease_timeout));
504	if (!timespecisset(&exptime)) {
505		if (!F_ISSET(rep, REP_F_LEASE_EXPIRED))
506			to = rep->lease_timeout;
507	} else {
508		__os_gettime(env, &mytime, 1);
509		RPRINT(env, DB_VERB_REP_LEASE, (env,
510    "wait_time: mytime %lu %lu, grant_expire %lu %lu",
511		    (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec,
512		    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
513		if (timespeccmp(&mytime, &exptime, <=)) {
514			/*
515			 * If the current time is before the grant expiration
516			 * compute the difference and return remaining grant
517			 * time.
518			 */
519			timespecsub(&exptime, &mytime);
520			DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
521		}
522	}
523	return (to);
524}
525