1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2007,2008 Oracle. All rights reserved. 5 * 6 * $Id: rep_lease.c,v 12.23 2008/01/11 21:49:26 sue Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/log.h" 13 14static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **)); 15 16/* 17 * __rep_update_grant - 18 * Update a client's lease grant for this perm record 19 * and send the grant to the master. Caller must 20 * hold the mtx_clientdb mutex. Timespec given is in 21 * host local format. 22 * 23 * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *)); 24 */ 25int 26__rep_update_grant(env, ts) 27 ENV *env; 28 db_timespec *ts; 29{ 30 DBT lease_dbt; 31 DB_LOG *dblp; 32 DB_REP *db_rep; 33 LOG *lp; 34 REP *rep; 35 __rep_grant_info_args gi; 36 db_timespec mytime; 37 u_int8_t buf[__REP_GRANT_INFO_SIZE]; 38 int ret; 39 size_t len; 40 41 db_rep = env->rep_handle; 42 rep = db_rep->region; 43 dblp = env->lg_handle; 44 lp = dblp->reginfo.primary; 45 timespecclear(&mytime); 46 47 /* 48 * Get current time, and add in the (skewed) lease duration 49 * time to send the grant to the master. 50 */ 51 __os_gettime(env, &mytime, 1); 52 timespecadd(&mytime, &rep->lease_duration); 53 REP_SYSTEM_LOCK(env); 54 /* 55 * If we are in an election, we cannot grant the lease. 56 * We need to check under the region mutex. 57 */ 58 if (IN_ELECTION(rep)) { 59 REP_SYSTEM_UNLOCK(env); 60 return (0); 61 } 62 if (timespeccmp(&mytime, &rep->grant_expire, >)) 63 rep->grant_expire = mytime; 64 REP_SYSTEM_UNLOCK(env); 65 66 /* 67 * Send the LEASE_GRANT message with the current lease grant 68 * no matter if we've actually extended the lease or not. 69 */ 70 gi.msg_sec = (u_int32_t)ts->tv_sec; 71 gi.msg_nsec = (u_int32_t)ts->tv_nsec; 72 73 if ((ret = __rep_grant_info_marshal(env, &gi, buf, 74 __REP_GRANT_INFO_SIZE, &len)) != 0) 75 return (ret); 76 DB_INIT_DBT(lease_dbt, buf, len); 77 (void)__rep_send_message(env, rep->master_id, REP_LEASE_GRANT, 78 &lp->max_perm_lsn, &lease_dbt, 0, 0); 79 return (0); 80} 81 82/* 83 * __rep_islease_granted - 84 * Return 0 if this client has no outstanding lease granted. 85 * Return 1 otherwise. 86 * Caller must hold the REP_SYSTEM (region) mutex. 87 * 88 * PUBLIC: int __rep_islease_granted __P((ENV *)); 89 */ 90int 91__rep_islease_granted(env) 92 ENV *env; 93{ 94 DB_REP *db_rep; 95 REP *rep; 96 db_timespec mytime; 97 98 db_rep = env->rep_handle; 99 rep = db_rep->region; 100 /* 101 * Get current time and compare against our granted lease. 102 */ 103 timespecclear(&mytime); 104 __os_gettime(env, &mytime, 1); 105 106 return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0); 107} 108 109/* 110 * __rep_lease_table_alloc - 111 * Allocate the lease table on a master. Called with rep mutex 112 * held. We need to acquire the env region mutex, so we need to 113 * make sure we never acquire those mutexes in the opposite order. 114 * 115 * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t)); 116 */ 117int 118__rep_lease_table_alloc(env, nsites) 119 ENV *env; 120 u_int32_t nsites; 121{ 122 REGENV *renv; 123 REGINFO *infop; 124 REP *rep; 125 REP_LEASE_ENTRY *le, *table; 126 int *lease, ret; 127 u_int32_t i; 128 129 rep = env->rep_handle->region; 130 131 infop = env->reginfo; 132 renv = infop->primary; 133 MUTEX_LOCK(env, renv->mtx_regenv); 134 if ((ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY), 135 &lease)) == 0) { 136 if (rep->lease_off != INVALID_ROFF) 137 __env_alloc_free(infop, 138 R_ADDR(infop, rep->lease_off)); 139 rep->lease_off = R_OFFSET(infop, lease); 140 } 141 MUTEX_UNLOCK(env, renv->mtx_regenv); 142 table = R_ADDR(infop, rep->lease_off); 143 for (i = 0; i < nsites; i++) { 144 le = &table[i]; 145 le->eid = DB_EID_INVALID; 146 timespecclear(&le->start_time); 147 timespecclear(&le->end_time); 148 ZERO_LSN(le->lease_lsn); 149 } 150 return (ret); 151} 152 153/* 154 * __rep_lease_grant - 155 * Handle incoming REP_LEASE_GRANT message on a master. 156 * 157 * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int)); 158 */ 159int 160__rep_lease_grant(env, rp, rec, eid) 161 ENV *env; 162 __rep_control_args *rp; 163 DBT *rec; 164 int eid; 165{ 166 DB_REP *db_rep; 167 REP *rep; 168 __rep_grant_info_args gi; 169 REP_LEASE_ENTRY *le; 170 db_timespec msg_time; 171 int ret; 172 173 db_rep = env->rep_handle; 174 rep = db_rep->region; 175 if ((ret = __rep_grant_info_unmarshal(env, 176 &gi, rec->data, rec->size, NULL)) != 0) 177 return (ret); 178 timespecset(&msg_time, gi.msg_sec, gi.msg_nsec); 179 le = NULL; 180 181 /* 182 * Get current time, and add in the (skewed) lease duration 183 * time to send the grant to the master. 184 */ 185 REP_SYSTEM_LOCK(env); 186 __rep_find_entry(env, rep, eid, &le); 187 /* 188 * We either get back this site's entry, or an empty entry 189 * that we need to initialize. 190 */ 191 DB_ASSERT(env, le != NULL); 192 /* 193 * Update the entry if it is an empty entry or if the new 194 * lease grant is a later start time than the current one. 195 */ 196 RPRINT(env, DB_VERB_REP_LEASE, 197 (env, "lease_grant: grant msg time %lu %lu", 198 (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec)); 199 if (le->eid == DB_EID_INVALID || 200 timespeccmp(&msg_time, &le->start_time, >)) { 201 le->eid = eid; 202 le->start_time = msg_time; 203 le->end_time = le->start_time; 204 timespecadd(&le->end_time, &rep->lease_duration); 205 RPRINT(env, DB_VERB_REP_LEASE, (env, 206 "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu", 207 le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec, 208 (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec, 209 (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec)); 210 /* 211 * XXX Is this really true? Could we have a lagging 212 * record that has a later start time, but smaller 213 * LSN than we have previously seen?? 214 */ 215 DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0); 216 le->lease_lsn = rp->lsn; 217 } 218 REP_SYSTEM_UNLOCK(env); 219 return (0); 220} 221 222/* 223 * Find the entry for the given EID. Or the first empty one. 224 */ 225static void 226__rep_find_entry(env, rep, eid, lep) 227 ENV *env; 228 REP *rep; 229 int eid; 230 REP_LEASE_ENTRY **lep; 231{ 232 REGINFO *infop; 233 REP_LEASE_ENTRY *le, *table; 234 u_int32_t i; 235 236 infop = env->reginfo; 237 table = R_ADDR(infop, rep->lease_off); 238 239 for (i = 0; i < rep->nsites; i++) { 240 le = &table[i]; 241 /* 242 * Find either the one that matches the client's 243 * EID or the first empty one. 244 */ 245 if (le->eid == eid || le->eid == DB_EID_INVALID) { 246 *lep = le; 247 return; 248 } 249 } 250 return; 251} 252 253/* 254 * __rep_lease_check - 255 * Return 0 if this master holds valid leases and can confirm 256 * its mastership. If leases are expired, an attempt is made 257 * to refresh the leases. If that fails, then return the 258 * DB_REP_LEASE_EXPIRED error to the user. No mutexes held. 259 * 260 * PUBLIC: int __rep_lease_check __P((ENV *, int)); 261 */ 262int 263__rep_lease_check(env, refresh) 264 ENV *env; 265 int refresh; 266{ 267 DB_LOG *dblp; 268 DB_LSN lease_lsn; 269 DB_REP *db_rep; 270 LOG *lp; 271 REGINFO *infop; 272 REP *rep; 273 REP_LEASE_ENTRY *le, *table; 274 db_timespec curtime; 275 int ret, tries; 276 u_int32_t i, min_leases, valid_leases; 277 278 infop = env->reginfo; 279 tries = 0; 280retry: 281 ret = 0; 282 db_rep = env->rep_handle; 283 rep = db_rep->region; 284 dblp = env->lg_handle; 285 lp = dblp->reginfo.primary; 286 LOG_SYSTEM_LOCK(env); 287 lease_lsn = lp->max_perm_lsn; 288 LOG_SYSTEM_UNLOCK(env); 289 REP_SYSTEM_LOCK(env); 290 min_leases = rep->nsites / 2; 291 292 __os_gettime(env, &curtime, 1); 293 RPRINT(env, DB_VERB_REP_LEASE, 294 (env, "lease_check: min_leases %lu curtime %lu %lu", 295 (u_long)min_leases, (u_long)curtime.tv_sec, 296 (u_long)curtime.tv_nsec)); 297 table = R_ADDR(infop, rep->lease_off); 298 for (i = 0, valid_leases = 0; 299 i < rep->nsites && valid_leases < min_leases; i++) { 300 le = &table[i]; 301 /* 302 * Count this lease as valid if: 303 * - It is a valid entry (has an EID). 304 * - The lease has not expired. 305 * - The LSN is up to date. 306 */ 307 if (le->eid != DB_EID_INVALID) { 308 RPRINT(env, DB_VERB_REP_LEASE, (env, 309 "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]", 310 (u_long)valid_leases, le->eid, 311 (u_long)le->lease_lsn.file, 312 (u_long)le->lease_lsn.offset)); 313 RPRINT(env, DB_VERB_REP_LEASE, 314 (env, "lease_check: endtime %lu %lu", 315 (u_long)le->end_time.tv_sec, 316 (u_long)le->end_time.tv_nsec)); 317 } 318 if (le->eid != DB_EID_INVALID && 319 timespeccmp(&le->end_time, &curtime, >=) && 320 LOG_COMPARE(&le->lease_lsn, &lease_lsn) == 0) 321 valid_leases++; 322 } 323 REP_SYSTEM_UNLOCK(env); 324 325 /* 326 * Now see if we have enough. 327 */ 328 RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu", 329 (u_long)valid_leases, (u_long)min_leases)); 330 if (valid_leases < min_leases) { 331 if (!refresh) 332 ret = DB_REP_LEASE_EXPIRED; 333 else { 334 /* 335 * If we are successful, we need to recheck the leases 336 * because the lease grant messages may have raced with 337 * the PERM acknowledgement. Give the grant messages 338 * a chance to arrive and be processed. 339 */ 340 if ((ret = __rep_lease_refresh(env)) == 0) { 341 if (tries <= LEASE_REFRESH_TRIES) { 342 /* 343 * If we were successful sending, but 344 * not in racing the message threads, 345 * then yield the processor so that 346 * the message threads get a chance 347 * to run. 348 */ 349 if (tries > 0) 350 __os_yield(env, 1, 0); 351 tries++; 352 goto retry; 353 } else 354 ret = DB_REP_LEASE_EXPIRED; 355 } 356 } 357 } 358 359 return (ret); 360} 361 362/* 363 * __rep_lease_refresh - 364 * Find the last permanent record and send that out so that it 365 * forces clients to grant their leases. 366 * 367 * PUBLIC: int __rep_lease_refresh __P((ENV *)); 368 */ 369int 370__rep_lease_refresh(env) 371 ENV *env; 372{ 373 DBT rec; 374 DB_LOGC *logc; 375 DB_LSN lsn; 376 DB_REP *db_rep; 377 REP *rep; 378 int ret, t_ret; 379 380 db_rep = env->rep_handle; 381 rep = db_rep->region; 382 383 if ((ret = __log_cursor(env, &logc)) != 0) 384 return (ret); 385 386 memset(&rec, 0, sizeof(rec)); 387 memset(&lsn, 0, sizeof(lsn)); 388 /* 389 * Use __rep_log_backup to find the last PERM record. 390 */ 391 if ((ret = __rep_log_backup(env, rep, logc, &lsn)) != 0) 392 goto err; 393 394 if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0) 395 goto err; 396 397 if ((ret = __rep_send_message(env, 398 DB_EID_BROADCAST, REP_LOG, &lsn, &rec, REPCTL_PERM, 0)) != 0) { 399 /* 400 * If we do not get an ack, we expire leases. 401 */ 402 (void)__rep_lease_expire(env, 0); 403 ret = DB_REP_LEASE_EXPIRED; 404 } 405 406err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) 407 ret = t_ret; 408 return (ret); 409} 410 411/* 412 * __rep_lease_expire - 413 * Proactively expire all leases granted to us. 414 * 415 * PUBLIC: int __rep_lease_expire __P((ENV *, int)); 416 */ 417int 418__rep_lease_expire(env, locked) 419 ENV *env; 420 int locked; 421{ 422 DB_REP *db_rep; 423 REGINFO *infop; 424 REP *rep; 425 REP_LEASE_ENTRY *le, *table; 426 int ret; 427 u_int32_t i; 428 429 ret = 0; 430 db_rep = env->rep_handle; 431 rep = db_rep->region; 432 infop = env->reginfo; 433 434 if (!locked) 435 REP_SYSTEM_LOCK(env); 436 if (rep->lease_off != INVALID_ROFF) { 437 table = R_ADDR(infop, rep->lease_off); 438 /* 439 * Expire all leases forcibly. We are guaranteed that the 440 * start_time for all leases are not in the future. Therefore, 441 * set the end_time to the start_time. 442 */ 443 for (i = 0; i < rep->nsites; i++) { 444 le = &table[i]; 445 le->end_time = le->start_time; 446 } 447 } 448 if (!locked) 449 REP_SYSTEM_UNLOCK(env); 450 return (ret); 451} 452 453/* 454 * __rep_lease_waittime - 455 * Return the amount of time remaining on a granted lease. 456 * Assume the caller holds the REP_SYSTEM (region) mutex. 457 * 458 * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *)); 459 */ 460db_timeout_t 461__rep_lease_waittime(env) 462 ENV *env; 463{ 464 DB_REP *db_rep; 465 REP *rep; 466 db_timespec exptime, mytime; 467 db_timeout_t to; 468 469 db_rep = env->rep_handle; 470 rep = db_rep->region; 471 exptime = rep->grant_expire; 472 to = 0; 473 /* 474 * If the lease has never been granted, we must wait a full 475 * lease timeout because we could be freshly rebooted after 476 * a crash and a lease could be granted from a previous 477 * incarnation of this client. 478 */ 479 RPRINT(env, DB_VERB_REP_LEASE, (env, 480 "wait_time: grant_expire %lu %lu lease_to %lu", 481 (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec, 482 (u_long)rep->lease_timeout)); 483 if (!timespecisset(&exptime)) 484 to = rep->lease_timeout; 485 else { 486 __os_gettime(env, &mytime, 1); 487 RPRINT(env, DB_VERB_REP_LEASE, (env, 488 "wait_time: mytime %lu %lu, grant_expire %lu %lu", 489 (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec, 490 (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec)); 491 if (timespeccmp(&mytime, &exptime, <=)) { 492 /* 493 * If the current time is before the grant expiration 494 * compute the difference and return remaining grant 495 * time. 496 */ 497 timespecsub(&exptime, &mytime); 498 DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1); 499 } 500 } 501 return (to); 502} 503