/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2006,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_resize.c,v 12.14 2008/03/13 15:21:21 mbrey Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static int __memp_add_bucket __P((DB_MPOOL *));
static int __memp_add_region __P((DB_MPOOL *));
static int __memp_map_regions __P((DB_MPOOL *));
static int __memp_merge_buckets
    __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
static int __memp_remove_bucket __P((DB_MPOOL *));
static int __memp_remove_region __P((DB_MPOOL *));

/*
 * __memp_get_bucket --
 *	Look up the cache region and hash bucket that a page of a file
 *	hashes to, mapping in additional cache regions as necessary.  If
 *	the caller passes a non-NULL hpp, the bucket is returned locked.
 *	Loops until the bucket found is still valid after locking, since
 *	another thread may resize the cache (changing mp->nbuckets) or
 *	replace a region while we wait for the mutex.
 *
 * PUBLIC: int __memp_get_bucket __P((ENV *,
 * PUBLIC:     MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **));
 */
int
__memp_get_bucket(env, mfp, pgno, infopp, hpp)
	ENV *env;
	MPOOLFILE *mfp;
	db_pgno_t pgno;
	REGINFO **infopp;
	DB_MPOOL_HASH **hpp;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	REGINFO *infop;
	roff_t mf_offset;
	u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
	u_int32_t *regids;
	int ret;

	dbmp = env->mp_handle;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	mp = dbmp->reginfo[0].primary;
	ret = 0;

	for (;;) {
		/* Snapshot the bucket count: it may change while we wait. */
		nbuckets = mp->nbuckets;
		MP_BUCKET(mf_offset, pgno, nbuckets, bucket);

		/*
		 * Once we work out which region we are looking in, we have to
		 * check that we have that region mapped, and that the version
		 * we have matches the ID in the main mpool region.  Otherwise
		 * we have to go and map in any regions that don't match and
		 * retry.
		 */
		region = NREGION(mp, bucket);
		regids = R_ADDR(dbmp->reginfo, mp->regids);

		for (;;) {
			infop = *infopp = &dbmp->reginfo[region];
			c_mp = infop->primary;

			/* If we have the correct region mapped, we're done. */
			if (c_mp != NULL && regids[region] == infop->id)
				break;
			if ((ret = __memp_map_regions(dbmp)) != 0)
				return (ret);
		}

		/* If our caller wants the hash bucket, lock it here. */
		if (hpp != NULL) {
			hp = R_ADDR(infop, c_mp->htab);
			hp = &hp[bucket - region * mp->htab_buckets];

			MUTEX_LOCK(env, hp->mtx_hash);

			/*
			 * Check that we still have the correct region mapped.
			 */
			if (regids[region] != infop->id) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				continue;
			}

			/*
			 * Now that the bucket is locked, we need to check that
			 * the cache has not been resized while we waited.
			 */
			new_nbuckets = mp->nbuckets;
			if (nbuckets != new_nbuckets) {
				MP_BUCKET(mf_offset, pgno, new_nbuckets,
				    new_bucket);

				if (new_bucket != bucket) {
					MUTEX_UNLOCK(env, hp->mtx_hash);
					continue;
				}
			}

			*hpp = hp;
		}

		break;
	}

	return (ret);
}

/*
 * __memp_merge_buckets --
 *	Move the buffers from old_bucket that will hash to new_bucket under
 *	the new bucket count into new_bucket, copying them into the target
 *	region.  Frozen buffers are thawed first and buffers left over from
 *	a previous split are freed.  On success, commits the resize by
 *	setting mp->nbuckets to new_nbuckets.  Called with no bucket mutexes
 *	held; acquires and releases both buckets' mutexes internally.
 */
static int
__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
	DB_MPOOL *dbmp;
	u_int32_t new_nbuckets, old_bucket, new_bucket;
{
	BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
	DB_LSN vlsn;
	DB_MPOOL_HASH *new_hp, *old_hp;
	ENV *env;
	MPOOL *mp, *new_mp, *old_mp;
	MPOOLFILE *mfp;
	REGINFO *new_infop, *old_infop;
	u_int32_t bucket, high_mask, new_region, old_region;
	int ret;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	new_bhp = NULL;
	ret = 0;

	MP_MASK(new_nbuckets, high_mask);

	/* Locate the source bucket within its region. */
	old_region = NREGION(mp, old_bucket);
	old_infop = &dbmp->reginfo[old_region];
	old_mp = old_infop->primary;
	old_hp = R_ADDR(old_infop, old_mp->htab);
	old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];

	/* Locate the target bucket within its region. */
	new_region = NREGION(mp, new_bucket);
	new_infop = &dbmp->reginfo[new_region];
	new_mp = new_infop->primary;
	new_hp = R_ADDR(new_infop, new_mp->htab);
	new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];

	/*
	 * Before merging, we need to check that there are no old buffers left
	 * in the target hash bucket after a previous split.
	 */
free_old:
	MUTEX_LOCK(env, new_hp->mtx_hash);
	SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
		MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);

		if (bucket != new_bucket) {
			/*
			 * There is no way that an old buffer can be locked
			 * after a split, since everyone will look for it in
			 * the new hash bucket.
			 */
			DB_ASSERT(env, !F_ISSET(bhp, BH_LOCKED | BH_DIRTY) &&
			    bhp->ref == 0);
			if ((ret = __memp_bhfree(dbmp,
			    new_infop, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
				MUTEX_UNLOCK(env, new_hp->mtx_hash);
				return (ret);
			}

			/*
			 * The free has modified the list of buffers and
			 * dropped the mutex.  We need to start again.
			 */
			goto free_old;
		}
	}
	MUTEX_UNLOCK(env, new_hp->mtx_hash);

	/*
	 * Before we begin, make sure that all of the buffers we care about are
	 * not in use and not frozen.  We do this because we can't drop the old
	 * hash bucket mutex once we start moving buffers around.
	 */
retry:	MUTEX_LOCK(env, old_hp->mtx_hash);
	SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
		MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
		    new_nbuckets, high_mask, bucket);

		if (bucket == new_bucket &&
		    (F_ISSET(bhp, BH_LOCKED) || bhp->ref != 0)) {
			/* In use: drop the mutex, yield, and start over. */
			MUTEX_UNLOCK(env, old_hp->mtx_hash);
			__os_yield(env, 0, 0);
			goto retry;
		} else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
			/* Pin the buffer while we thaw it. */
			++bhp->ref;
			if (BH_OBSOLETE(bhp, old_hp->old_reader, vlsn))
				alloc_bhp = NULL;
			else {
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
				MUTEX_UNLOCK(env, old_hp->mtx_hash);
				if ((ret = __memp_alloc(dbmp,
				    old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
					return (ret);
				MUTEX_LOCK(env, old_hp->mtx_hash);
			}
			if ((ret = __memp_bh_thaw(dbmp,
			    old_infop, old_hp, bhp, alloc_bhp)) != 0) {
				MUTEX_UNLOCK(env, old_hp->mtx_hash);
				return (ret);
			}

			/*
			 * We've dropped the mutex in order to thaw, so we need
			 * to go back to the beginning and check that all of
			 * the buffers we care about are still unlocked and
			 * unreferenced.
			 */
			MUTEX_UNLOCK(env, old_hp->mtx_hash);
			goto retry;
		}
	}

	/*
	 * We now know that all of the buffers we care about are unlocked and
	 * unreferenced.  Go ahead and copy them.
	 */
	SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
		MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
		    new_nbuckets, high_mask, bucket);
		mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/*
		 * We ignore buffers that don't hash to the new bucket.  We
		 * could also ignore clean buffers which are not part of a
		 * multiversion chain as long as they have a backing file.
		 */
		if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
		    SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
			continue;

		/* Copy the whole version chain, oldest link allocated last. */
		for (current_bhp = bhp, next_bhp = NULL;
		    current_bhp != NULL;
		    current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
		    next_bhp = alloc_bhp) {
			if ((ret = __memp_alloc(dbmp,
			    new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
				break;

			alloc_bhp->ref = current_bhp->ref;
			alloc_bhp->ref_sync = current_bhp->ref_sync;
			alloc_bhp->priority = current_bhp->priority;
			alloc_bhp->pgno = current_bhp->pgno;
			alloc_bhp->mf_offset = current_bhp->mf_offset;
			alloc_bhp->flags = current_bhp->flags;
			alloc_bhp->td_off = current_bhp->td_off;

			/*
			 * We've duplicated the buffer, so now we need to
			 * update reference counts, including the counts in the
			 * per-MPOOLFILE and the transaction detail (for MVCC
			 * buffers).
			 */
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);

			if (alloc_bhp->td_off != INVALID_ROFF &&
			    (ret = __txn_add_buffer(env,
			    R_ADDR(&env->tx_handle->reginfo,
			    alloc_bhp->td_off))) != 0)
				break;

			/*
			 * NOTE(review): this copies from the chain head
			 * (bhp->buf) for every link of the version chain,
			 * while the metadata above comes from current_bhp.
			 * Confirm against upstream whether current_bhp->buf
			 * was intended here.
			 */
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);

			/*
			 * We build up the MVCC chain first, then insert the
			 * head (stored in new_bhp) once.
			 */
			if (next_bhp == NULL) {
				SH_CHAIN_INIT(alloc_bhp, vc);
				new_bhp = alloc_bhp;
			} else
				SH_CHAIN_INSERT_BEFORE(
				    next_bhp, alloc_bhp, vc, __bh);
		}

		/*
		 * NOTE(review): if an allocation or __txn_add_buffer call
		 * above failed (ret != 0), we still fall through and insert
		 * new_bhp -- which is NULL if the very first allocation of
		 * the first copied chain failed -- and keep iterating the
		 * outer loop.  Verify the error paths against upstream.
		 */
		MUTEX_LOCK(env, new_hp->mtx_hash);
		SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
		if (F_ISSET(new_bhp, BH_DIRTY))
			++new_hp->hash_page_dirty;

		MUTEX_UNLOCK(env, new_hp->mtx_hash);

		/* The copy in the new bucket now owns the dirty state. */
		if (F_ISSET(bhp, BH_DIRTY)) {
			F_CLR(bhp, BH_DIRTY);
			--old_hp->hash_page_dirty;
		}
	}

	/* Commit the new bucket count only if everything succeeded. */
	if (ret == 0)
		mp->nbuckets = new_nbuckets;
	MUTEX_UNLOCK(env, old_hp->mtx_hash);

	return (ret);
}

/*
 * __memp_add_bucket --
 *	Grow the cache by one hash bucket: the new bucket (mp->nbuckets)
 *	receives the buffers that split off from its "parent" bucket, the
 *	one whose hash value matches with the new high bit masked off.
 */
static int
__memp_add_bucket(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	u_int32_t high_mask, new_bucket, old_bucket;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;

	new_bucket = mp->nbuckets;
	/* We should always be adding buckets to the last region. */
	DB_ASSERT(env, NREGION(mp, new_bucket) == mp->nreg - 1);
	MP_MASK(mp->nbuckets, high_mask);
	old_bucket = new_bucket & (high_mask >> 1);

	/*
	 * With fixed-sized regions, the new region is always smaller than the
	 * existing total cache size, so buffers always need to be copied.  If
	 * we implement variable region sizes, it's possible that we will be
	 * splitting a hash bucket in the new region.  Catch that here.
	 */
	DB_ASSERT(env, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));

	return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
	    old_bucket, new_bucket));
}

/*
 * __memp_add_region --
 *	Attach and initialize one more cache region, then split buckets
 *	into it one at a time until it holds its share (htab_buckets).
 */
static int
__memp_add_region(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	REGINFO *infop;
	int ret;
	roff_t reg_size;
	u_int i;
	u_int32_t *regids;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	/* All cache regions are the same size. */
	reg_size = dbmp->reginfo[0].rp->size;
	ret = 0;

	infop = &dbmp->reginfo[mp->nreg];
	infop->env = env;
	infop->type = REGION_TYPE_MPOOL;
	infop->id = INVALID_REGION_ID;
	infop->flags = REGION_CREATE_OK;
	if ((ret = __env_region_attach(env, infop, reg_size)) != 0)
		return (ret);
	if ((ret = __memp_init(env,
	    dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
		return (ret);
	/* Publish the new region's ID so other processes can map it. */
	regids = R_ADDR(dbmp->reginfo, mp->regids);
	regids[mp->nreg++] = infop->id;

	for (i = 0; i < mp->htab_buckets; i++)
		if ((ret = __memp_add_bucket(dbmp)) != 0)
			break;

	return (ret);
}

/*
 * __memp_remove_bucket --
 *	Shrink the cache by one hash bucket, merging the last bucket's
 *	buffers back into the bucket they split from.
 */
static int
__memp_remove_bucket(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	u_int32_t high_mask, new_bucket, old_bucket;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;

	old_bucket = mp->nbuckets - 1;

	/* We should always be removing buckets from the last region. */
	DB_ASSERT(env, NREGION(mp, old_bucket) == mp->nreg - 1);
	MP_MASK(mp->nbuckets - 1, high_mask);
	new_bucket = old_bucket & (high_mask >> 1);

	return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
	    old_bucket, new_bucket));
}

/*
 * __memp_remove_region --
 *	Empty the last cache region bucket-by-bucket, then detach from it
 *	and destroy it.  The primary region (nreg == 1) can never be
 *	removed.
 */
static int
__memp_remove_region(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	REGINFO *infop;
	int ret;
	u_int i;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	ret = 0;

	if (mp->nreg == 1) {
		__db_errx(env, "cannot remove the last cache");
		return (EINVAL);
	}

	for (i = 0; i < mp->htab_buckets; i++)
		if ((ret = __memp_remove_bucket(dbmp)) != 0)
			return (ret);

	/* Detach from the region then destroy it. */
	infop = &dbmp->reginfo[--mp->nreg];
	return (__env_region_detach(env, infop, 1));
}

/*
 * __memp_map_regions --
 *	Bring this process's region mappings in sync with the IDs recorded
 *	in the primary region: attach any region we don't have (or have a
 *	stale version of), and detach from regions beyond the current
 *	count.
 */
static int
__memp_map_regions(dbmp)
	DB_MPOOL *dbmp;
{
	ENV *env;
	MPOOL *mp;
	int ret;
	u_int i;
	u_int32_t *regids;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	regids = R_ADDR(dbmp->reginfo, mp->regids);
	ret = 0;

	for (i = 1; i < mp->nreg; ++i) {
		/* Already mapped and the ID matches: nothing to do. */
		if (dbmp->reginfo[i].primary != NULL &&
		    dbmp->reginfo[i].id == regids[i])
			continue;

		/* Stale mapping: detach before re-attaching. */
		if (dbmp->reginfo[i].primary != NULL)
			ret = __env_region_detach(env, &dbmp->reginfo[i], 0);

		dbmp->reginfo[i].env = env;
		dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
		dbmp->reginfo[i].id = regids[i];
		dbmp->reginfo[i].flags = REGION_JOIN_OK;
		if ((ret =
		    __env_region_attach(env, &dbmp->reginfo[i], 0)) != 0)
			return (ret);
		dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
		    dbmp->reginfo[i].rp->primary);
	}

	/* Detach from any regions beyond the current region count. */
	for (; i < mp->max_nreg; i++)
		if (dbmp->reginfo[i].primary != NULL &&
		    (ret = __env_region_detach(env,
		    &dbmp->reginfo[i], 0)) != 0)
			break;

	return (ret);
}

/*
 * __memp_resize --
 *	Resize the cache to the given total size by adding or removing
 *	whole regions, one at a time, under the resize mutex.  The target
 *	region count is the requested size divided by the (fixed) region
 *	size, rounded to nearest, clamped to [1, max_nreg].
 *
 * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
 */
int
__memp_resize(dbmp, gbytes, bytes)
	DB_MPOOL *dbmp;
	u_int32_t gbytes, bytes;
{
	ENV *env;
	MPOOL *mp;
	int ret;
	u_int32_t ncache;
	roff_t reg_size, total_size;

	env = dbmp->env;
	mp = dbmp->reginfo[0].primary;
	reg_size = dbmp->reginfo[0].rp->size;
	total_size = (roff_t)gbytes * GIGABYTE + bytes;
	/* Round to the nearest whole number of regions. */
	ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);

	if (ncache < 1)
		ncache = 1;
	else if (ncache > mp->max_nreg) {
		__db_errx(env,
		    "cannot resize to %lu cache regions: maximum is %lu",
		    (u_long)ncache, (u_long)mp->max_nreg);
		return (EINVAL);
	}

	ret = 0;
	MUTEX_LOCK(env, mp->mtx_resize);
	while (mp->nreg != ncache)
		if ((ret = (mp->nreg < ncache ?
		    __memp_add_region(dbmp) :
		    __memp_remove_region(dbmp))) != 0)
			break;
	MUTEX_UNLOCK(env, mp->mtx_resize);

	return (ret);
}

/*
 * __memp_get_cache_max --
 *	Return the maximum cache size as gigabytes plus remaining bytes.
 *	After the environment is open the limit comes from the region
 *	(max_nreg * region size); before open, from the DB_ENV handle.
 *
 * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
 */
int
__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
	DB_ENV *dbenv;
	u_int32_t *max_gbytesp, *max_bytesp;
{
	DB_MPOOL *dbmp;
	ENV *env;
	MPOOL *mp;
	roff_t reg_size, max_size;

	env = dbenv->env;

	ENV_NOT_CONFIGURED(env,
	    env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);

	if (MPOOL_ON(env)) {
		/* Cannot be set after open, no lock required to read. */
		dbmp = env->mp_handle;
		mp = dbmp->reginfo[0].primary;
		reg_size = dbmp->reginfo[0].rp->size;
		max_size = mp->max_nreg * reg_size;
		*max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
		*max_bytesp = (u_int32_t)(max_size % GIGABYTE);
	} else {
		*max_gbytesp = dbenv->mp_max_gbytes;
		*max_bytesp = dbenv->mp_max_bytes;
	}

	return (0);
}

/*
 * __memp_set_cache_max --
 *	Record the maximum cache size on the DB_ENV handle.  Illegal once
 *	the environment has been opened.
 *
 * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
 */
int
__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
	DB_ENV *dbenv;
	u_int32_t max_gbytes, max_bytes;
{
	ENV *env;

	env = dbenv->env;

	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_cache_max");
	dbenv->mp_max_gbytes = max_gbytes;
	dbenv->mp_max_bytes = max_bytes;

	return (0);
}