1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2007-2009 Oracle. All rights reserved. 5 * 6 * $Id$ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/log.h" 13 14static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **)); 15 16/* 17 * __rep_update_grant - 18 * Update a client's lease grant for this perm record 19 * and send the grant to the master. Caller must 20 * hold the mtx_clientdb mutex. Timespec given is in 21 * host local format. 22 * 23 * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *)); 24 */ 25int 26__rep_update_grant(env, ts) 27 ENV *env; 28 db_timespec *ts; 29{ 30 DBT lease_dbt; 31 DB_LOG *dblp; 32 DB_REP *db_rep; 33 LOG *lp; 34 REP *rep; 35 __rep_grant_info_args gi; 36 db_timespec mytime; 37 u_int8_t buf[__REP_GRANT_INFO_SIZE]; 38 int master, ret; 39 size_t len; 40 41 db_rep = env->rep_handle; 42 rep = db_rep->region; 43 dblp = env->lg_handle; 44 lp = dblp->reginfo.primary; 45 timespecclear(&mytime); 46 47 /* 48 * Get current time, and add in the (skewed) lease duration 49 * time to send the grant to the master. 50 */ 51 __os_gettime(env, &mytime, 1); 52 timespecadd(&mytime, &rep->lease_duration); 53 REP_SYSTEM_LOCK(env); 54 /* 55 * If we are in an election, we cannot grant the lease. 56 * We need to check under the region mutex. 57 */ 58 if (IN_ELECTION(rep)) { 59 REP_SYSTEM_UNLOCK(env); 60 return (0); 61 } 62 if (timespeccmp(&mytime, &rep->grant_expire, >)) 63 rep->grant_expire = mytime; 64 F_CLR(rep, REP_F_LEASE_EXPIRED); 65 REP_SYSTEM_UNLOCK(env); 66 67 /* 68 * Send the LEASE_GRANT message with the current lease grant 69 * no matter if we've actually extended the lease or not. 70 */ 71 gi.msg_sec = (u_int32_t)ts->tv_sec; 72 gi.msg_nsec = (u_int32_t)ts->tv_nsec; 73 74 if ((ret = __rep_grant_info_marshal(env, &gi, buf, 75 __REP_GRANT_INFO_SIZE, &len)) != 0) 76 return (ret); 77 DB_INIT_DBT(lease_dbt, buf, len); 78 if ((master = rep->master_id) != DB_EID_INVALID) 79 (void)__rep_send_message(env, master, REP_LEASE_GRANT, 80 &lp->max_perm_lsn, &lease_dbt, 0, 0); 81 return (0); 82} 83 84/* 85 * __rep_islease_granted - 86 * Return 0 if this client has no outstanding lease granted. 87 * Return 1 otherwise. 88 * Caller must hold the REP_SYSTEM (region) mutex. 89 * 90 * PUBLIC: int __rep_islease_granted __P((ENV *)); 91 */ 92int 93__rep_islease_granted(env) 94 ENV *env; 95{ 96 DB_REP *db_rep; 97 REP *rep; 98 db_timespec mytime; 99 100 db_rep = env->rep_handle; 101 rep = db_rep->region; 102 /* 103 * Get current time and compare against our granted lease. 104 */ 105 timespecclear(&mytime); 106 __os_gettime(env, &mytime, 1); 107 108 return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0); 109} 110 111/* 112 * __rep_lease_table_alloc - 113 * Allocate the lease table on a master. Called with rep mutex 114 * held. We need to acquire the env region mutex, so we need to 115 * make sure we never acquire those mutexes in the opposite order. 116 * 117 * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t)); 118 */ 119int 120__rep_lease_table_alloc(env, nsites) 121 ENV *env; 122 u_int32_t nsites; 123{ 124 REGENV *renv; 125 REGINFO *infop; 126 REP *rep; 127 REP_LEASE_ENTRY *le, *table; 128 int *lease, ret; 129 u_int32_t i; 130 131 rep = env->rep_handle->region; 132 133 infop = env->reginfo; 134 renv = infop->primary; 135 MUTEX_LOCK(env, renv->mtx_regenv); 136 /* 137 * If we have an old table from some other time, free it and 138 * allocate ourselves a new one that is known to be for 139 * the right number of sites. 140 */ 141 if (rep->lease_off != INVALID_ROFF) { 142 __env_alloc_free(infop, 143 R_ADDR(infop, rep->lease_off)); 144 rep->lease_off = INVALID_ROFF; 145 } 146 ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY), 147 &lease); 148 MUTEX_UNLOCK(env, renv->mtx_regenv); 149 if (ret != 0) 150 return (ret); 151 else 152 rep->lease_off = R_OFFSET(infop, lease); 153 table = R_ADDR(infop, rep->lease_off); 154 for (i = 0; i < nsites; i++) { 155 le = &table[i]; 156 le->eid = DB_EID_INVALID; 157 timespecclear(&le->start_time); 158 timespecclear(&le->end_time); 159 ZERO_LSN(le->lease_lsn); 160 } 161 return (0); 162} 163 164/* 165 * __rep_lease_grant - 166 * Handle incoming REP_LEASE_GRANT message on a master. 167 * 168 * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int)); 169 */ 170int 171__rep_lease_grant(env, rp, rec, eid) 172 ENV *env; 173 __rep_control_args *rp; 174 DBT *rec; 175 int eid; 176{ 177 DB_REP *db_rep; 178 REP *rep; 179 __rep_grant_info_args gi; 180 REP_LEASE_ENTRY *le; 181 db_timespec msg_time; 182 int ret; 183 184 db_rep = env->rep_handle; 185 rep = db_rep->region; 186 if ((ret = __rep_grant_info_unmarshal(env, 187 &gi, rec->data, rec->size, NULL)) != 0) 188 return (ret); 189 timespecset(&msg_time, gi.msg_sec, gi.msg_nsec); 190 le = NULL; 191 192 /* 193 * Get current time, and add in the (skewed) lease duration 194 * time to send the grant to the master. 195 */ 196 REP_SYSTEM_LOCK(env); 197 __rep_find_entry(env, rep, eid, &le); 198 /* 199 * We either get back this site's entry, or an empty entry 200 * that we need to initialize. 201 */ 202 DB_ASSERT(env, le != NULL); 203 /* 204 * Update the entry if it is an empty entry or if the new 205 * lease grant is a later start time than the current one. 206 */ 207 RPRINT(env, DB_VERB_REP_LEASE, 208 (env, "lease_grant: grant msg time %lu %lu", 209 (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec)); 210 if (le->eid == DB_EID_INVALID || 211 timespeccmp(&msg_time, &le->start_time, >)) { 212 le->eid = eid; 213 le->start_time = msg_time; 214 le->end_time = le->start_time; 215 timespecadd(&le->end_time, &rep->lease_duration); 216 RPRINT(env, DB_VERB_REP_LEASE, (env, 217 "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu", 218 le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec, 219 (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec, 220 (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec)); 221 /* 222 * XXX Is this really true? Could we have a lagging 223 * record that has a later start time, but smaller 224 * LSN than we have previously seen?? 225 */ 226 DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0); 227 le->lease_lsn = rp->lsn; 228 } 229 REP_SYSTEM_UNLOCK(env); 230 return (0); 231} 232 233/* 234 * Find the entry for the given EID. Or the first empty one. 235 */ 236static void 237__rep_find_entry(env, rep, eid, lep) 238 ENV *env; 239 REP *rep; 240 int eid; 241 REP_LEASE_ENTRY **lep; 242{ 243 REGINFO *infop; 244 REP_LEASE_ENTRY *le, *table; 245 u_int32_t i; 246 247 infop = env->reginfo; 248 table = R_ADDR(infop, rep->lease_off); 249 250 for (i = 0; i < rep->nsites; i++) { 251 le = &table[i]; 252 /* 253 * Find either the one that matches the client's 254 * EID or the first empty one. 255 */ 256 if (le->eid == eid || le->eid == DB_EID_INVALID) { 257 *lep = le; 258 return; 259 } 260 } 261 return; 262} 263 264/* 265 * __rep_lease_check - 266 * Return 0 if this master holds valid leases and can confirm 267 * its mastership. If leases are expired, an attempt is made 268 * to refresh the leases. If that fails, then return the 269 * DB_REP_LEASE_EXPIRED error to the user. No mutexes held. 270 * 271 * PUBLIC: int __rep_lease_check __P((ENV *, int)); 272 */ 273int 274__rep_lease_check(env, refresh) 275 ENV *env; 276 int refresh; 277{ 278 DB_LOG *dblp; 279 DB_LSN lease_lsn; 280 DB_REP *db_rep; 281 LOG *lp; 282 REGINFO *infop; 283 REP *rep; 284 REP_LEASE_ENTRY *le, *table; 285 db_timespec curtime; 286 int ret, tries; 287 u_int32_t i, min_leases, valid_leases; 288 289 infop = env->reginfo; 290 tries = 0; 291 db_rep = env->rep_handle; 292 rep = db_rep->region; 293 dblp = env->lg_handle; 294 lp = dblp->reginfo.primary; 295 LOG_SYSTEM_LOCK(env); 296 lease_lsn = lp->max_perm_lsn; 297 LOG_SYSTEM_UNLOCK(env); 298 299retry: 300 REP_SYSTEM_LOCK(env); 301 min_leases = rep->nsites / 2; 302 ret = 0; 303 __os_gettime(env, &curtime, 1); 304 RPRINT(env, DB_VERB_REP_LEASE, (env, 305 "lease_check: try %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]", 306 tries, 307 (u_long)min_leases, (u_long)curtime.tv_sec, 308 (u_long)curtime.tv_nsec, 309 (u_long)lease_lsn.file, 310 (u_long)lease_lsn.offset)); 311 table = R_ADDR(infop, rep->lease_off); 312 for (i = 0, valid_leases = 0; 313 i < rep->nsites && valid_leases < min_leases; i++) { 314 le = &table[i]; 315 /* 316 * Count this lease as valid if: 317 * - It is a valid entry (has an EID). 318 * - The lease has not expired. 319 * - The LSN is up to date. 320 */ 321 if (le->eid != DB_EID_INVALID) { 322 RPRINT(env, DB_VERB_REP_LEASE, (env, 323 "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]", 324 (u_long)valid_leases, le->eid, 325 (u_long)le->lease_lsn.file, 326 (u_long)le->lease_lsn.offset)); 327 RPRINT(env, DB_VERB_REP_LEASE, 328 (env, "lease_check: endtime %lu %lu", 329 (u_long)le->end_time.tv_sec, 330 (u_long)le->end_time.tv_nsec)); 331 } 332 if (le->eid != DB_EID_INVALID && 333 timespeccmp(&le->end_time, &curtime, >=) && 334 LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0) 335 valid_leases++; 336 } 337 REP_SYSTEM_UNLOCK(env); 338 339 /* 340 * Now see if we have enough. 341 */ 342 RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu", 343 (u_long)valid_leases, (u_long)min_leases)); 344 if (valid_leases < min_leases) { 345 if (!refresh) 346 ret = DB_REP_LEASE_EXPIRED; 347 else { 348 /* 349 * If we are successful, we need to recheck the leases 350 * because the lease grant messages may have raced with 351 * the PERM acknowledgement. Give the grant messages 352 * a chance to arrive and be processed. 353 */ 354 if ((ret = __rep_lease_refresh(env)) == 0) { 355 if (tries <= LEASE_REFRESH_TRIES) { 356 /* 357 * If we were successful sending, but 358 * not in racing the message threads, 359 * then yield the processor so that 360 * the message threads get a chance 361 * to run. 362 */ 363 if (tries > 0) 364 __os_yield(env, 1, 0); 365 tries++; 366 goto retry; 367 } else 368 ret = DB_REP_LEASE_EXPIRED; 369 } 370 } 371 } 372 373 if (ret == DB_REP_LEASE_EXPIRED) 374 RPRINT(env, DB_VERB_REP_LEASE, (env, 375 "lease_check: Expired. Only %lu valid", 376 (u_long)valid_leases)); 377 return (ret); 378} 379 380/* 381 * __rep_lease_refresh - 382 * Find the last permanent record and send that out so that it 383 * forces clients to grant their leases. 384 * 385 * If there is no permanent record, this function cannot refresh 386 * leases. That should not happen because the master should write 387 * a checkpoint when it starts, if there is no other perm record. 388 * 389 * PUBLIC: int __rep_lease_refresh __P((ENV *)); 390 */ 391int 392__rep_lease_refresh(env) 393 ENV *env; 394{ 395 DBT rec; 396 DB_LOGC *logc; 397 DB_LSN lsn; 398 DB_REP *db_rep; 399 REP *rep; 400 int ret, t_ret; 401 402 db_rep = env->rep_handle; 403 rep = db_rep->region; 404 405 if ((ret = __log_cursor(env, &logc)) != 0) 406 return (ret); 407 408 memset(&rec, 0, sizeof(rec)); 409 memset(&lsn, 0, sizeof(lsn)); 410 /* 411 * Use __rep_log_backup to find the last PERM record. 412 */ 413 if ((ret = __rep_log_backup(env, rep, logc, &lsn)) != 0) { 414 /* 415 * If there is no PERM record, then we get DB_NOTFOUND. 416 */ 417 if (ret == DB_NOTFOUND) 418 ret = 0; 419 goto err; 420 } 421 422 if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0) 423 goto err; 424 425 (void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn, 426 &rec, REPCTL_PERM, 0); 427 428err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) 429 ret = t_ret; 430 return (ret); 431} 432 433/* 434 * __rep_lease_expire - 435 * Proactively expire all leases granted to us. 436 * Assume the caller holds the REP_SYSTEM (region) mutex. 437 * 438 * PUBLIC: int __rep_lease_expire __P((ENV *)); 439 */ 440int 441__rep_lease_expire(env) 442 ENV *env; 443{ 444 DB_REP *db_rep; 445 REGINFO *infop; 446 REP *rep; 447 REP_LEASE_ENTRY *le, *table; 448 int ret; 449 u_int32_t i; 450 451 ret = 0; 452 db_rep = env->rep_handle; 453 rep = db_rep->region; 454 infop = env->reginfo; 455 456 if (rep->lease_off != INVALID_ROFF) { 457 table = R_ADDR(infop, rep->lease_off); 458 /* 459 * Expire all leases forcibly. We are guaranteed that the 460 * start_time for all leases are not in the future. Therefore, 461 * set the end_time to the start_time. 462 */ 463 for (i = 0; i < rep->nsites; i++) { 464 le = &table[i]; 465 le->end_time = le->start_time; 466 } 467 } 468 return (ret); 469} 470 471/* 472 * __rep_lease_waittime - 473 * Return the amount of time remaining on a granted lease. 474 * Assume the caller holds the REP_SYSTEM (region) mutex. 475 * 476 * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *)); 477 */ 478db_timeout_t 479__rep_lease_waittime(env) 480 ENV *env; 481{ 482 DB_REP *db_rep; 483 REP *rep; 484 db_timespec exptime, mytime; 485 db_timeout_t to; 486 487 db_rep = env->rep_handle; 488 rep = db_rep->region; 489 exptime = rep->grant_expire; 490 to = 0; 491 /* 492 * If the lease has never been granted, we must wait a full 493 * lease timeout because we could be freshly rebooted after 494 * a crash and a lease could be granted from a previous 495 * incarnation of this client. However, if the lease has never 496 * been granted, and this client has already waited a full 497 * lease timeout, we know our lease cannot be granted and there 498 * is no need to wait again. 499 */ 500 RPRINT(env, DB_VERB_REP_LEASE, (env, 501 "wait_time: grant_expire %lu %lu lease_to %lu", 502 (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec, 503 (u_long)rep->lease_timeout)); 504 if (!timespecisset(&exptime)) { 505 if (!F_ISSET(rep, REP_F_LEASE_EXPIRED)) 506 to = rep->lease_timeout; 507 } else { 508 __os_gettime(env, &mytime, 1); 509 RPRINT(env, DB_VERB_REP_LEASE, (env, 510 "wait_time: mytime %lu %lu, grant_expire %lu %lu", 511 (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec, 512 (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec)); 513 if (timespeccmp(&mytime, &exptime, <=)) { 514 /* 515 * If the current time is before the grant expiration 516 * compute the difference and return remaining grant 517 * time. 518 */ 519 timespecsub(&exptime, &mytime); 520 DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1); 521 } 522 } 523 return (to); 524} 525